import sys
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import OneSidedSelection, TomekLinks
from collections import Counter
import random
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from imblearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.manifold import MDS, Isomap, LocallyLinearEmbedding, TSNE
from adjustText import adjust_text
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import DBSCAN
from xgboost import XGBClassifier
import folium
# Utility function for graphing
def plot_cols(dataframe, data_cols, subplot_columns, graph_type, **kwargs):
# If input for subplot_columns is not valid pick the best option
gcd = np.gcd(len(data_cols), subplot_columns)
if gcd == 1:
num_columns = subplot_columns
else:
num_columns = gcd
# Assign columns to a subplot
plot_type = getattr(sns, graph_type)
matrix = np.array(data_cols).reshape(-1, num_columns)
rows, columns = matrix.shape
# Dynamically change figsize based on # of rows/columns
height_per_row = 5 # You can adjust this value depending on how much space each plot needs
width_per_column = 5 # You can adjust this value for plot width
figsize = (width_per_column * columns, height_per_row * rows)
fig, axs = plt.subplots(rows, columns, figsize=figsize)
for i, ax in enumerate(axs.flat):
if i < len(data_cols):
col = data_cols[i]
plot_type(data=dataframe, x=col, ax=ax, **kwargs)
skewness = np.mean((dataframe[col] - np.mean(dataframe[col])) **3 ) / np.std(dataframe[col])**3
if skewness > 1:
ax.set_xscale('log')
ax.set_title(col)
else:
ax.axis('off')
plt.tight_layout(pad=5)
plt.show()
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_columns', 300)
Data Preprocessing¶
file_path = 'C:/Users/rdn91/OneDrive/Desktop/Coding/Python_Projects/california_counties/data/SmartLocationDatabaseV3/SmartLocationDatabase.gdb'
print(gpd.list_layers(file_path))
# Select only California Data
sql = f"""
SELECT * FROM "EPA_SLD_Database_V3"
WHERE STATEFP = '06'
"""
california_data = gpd.read_file(file_path, engine="pyogrio", sql=sql)
# Write to CSV
#california_data.to_csv('C:/Users/rdn91/Downloads/WalkabilityIndex/datatable.csv')
# Write to GeoJSON
#raw_df.to_file('/path/to/out_geojson.geojson', driver='GeoJSON')
name geometry_type 0 EPA_SLD_Database_V3 MultiPolygon
california_data.head() #geoid --> census block group
| GEOID10 | GEOID20 | STATEFP | COUNTYFP | TRACTCE | BLKGRPCE | CSA | CSA_Name | CBSA | CBSA_Name | CBSA_POP | CBSA_EMP | CBSA_WRK | Ac_Total | Ac_Water | Ac_Land | Ac_Unpr | TotPop | CountHU | HH | P_WrkAge | AutoOwn0 | Pct_AO0 | AutoOwn1 | Pct_AO1 | AutoOwn2p | Pct_AO2p | Workers | R_LowWageWk | R_MedWageWk | R_HiWageWk | R_PCTLOWWAGE | TotEmp | E5_Ret | E5_Off | E5_Ind | E5_Svc | E5_Ent | E8_Ret | E8_off | E8_Ind | E8_Svc | E8_Ent | E8_Ed | E8_Hlth | E8_Pub | E_LowWageWk | E_MedWageWk | E_HiWageWk | E_PctLowWage | D1A | D1B | D1C | D1C5_RET | D1C5_OFF | D1C5_IND | D1C5_SVC | D1C5_ENT | D1C8_RET | D1C8_OFF | D1C8_IND | D1C8_SVC | D1C8_ENT | D1C8_ED | D1C8_HLTH | D1C8_PUB | D1D | D1_FLAG | D2A_JPHH | D2B_E5MIX | D2B_E5MIXA | D2B_E8MIX | D2B_E8MIXA | D2A_EPHHM | D2C_TRPMX1 | D2C_TRPMX2 | D2C_TRIPEQ | D2R_JOBPOP | D2R_WRKEMP | D2A_WRKEMP | D2C_WREMLX | D3A | D3AAO | D3AMM | D3APO | D3B | D3BAO | D3BMM3 | D3BMM4 | D3BPO3 | D3BPO4 | D4A | D4B025 | D4B050 | D4C | D4D | D4E | D5AR | D5AE | D5BR | D5BE | D5CR | D5CRI | D5CE | D5CEI | D5DR | D5DRI | D5DE | D5DEI | D2A_Ranked | D2B_Ranked | D3B_Ranked | D4A_Ranked | NatWalkInd | Region | Households | Workers_1 | Residents | Drivers | Vehicles | White | Male | Lowwage | Medwage | Highwage | W_P_Lowwage | W_P_Medwage | W_P_Highwage | GasPrice | logd1a | logd1c | logd3aao | logd3apo | d4bo25 | d5dei_1 | logd4d | UPTpercap | B_C_constant | B_C_male | B_C_ld1c | B_C_drvmveh | B_C_ld1a | B_C_ld3apo | B_C_inc1 | B_C_gasp | B_N_constant | B_N_inc2 | B_N_inc3 | B_N_white | B_N_male | B_N_drvmveh | B_N_gasp | B_N_ld1a | B_N_ld1c | B_N_ld3aao | B_N_ld3apo | B_N_d4bo25 | B_N_d5dei | B_N_UPTpc | C_R_Households | C_R_Pop | C_R_Workers | C_R_Drivers | C_R_Vehicles | C_R_White | C_R_Male | C_R_Lowwage | C_R_Medwage | C_R_Highwage | C_R_DrmV | NonCom_VMT_Per_Worker | Com_VMT_Per_Worker | VMT_per_worker | VMT_tot_min | VMT_tot_max | VMT_tot_avg | GHG_per_worker | Annual_GHG | SLC_score | Shape_Length | Shape_Area | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 060530111013 | 060530111013 | 06 | 053 | 011101 | 3 | None | None | 41500 | Salinas, CA | 433212.0 | 171278.0 | 168077.0 | 346.017589 | 0.112678 | 345.904912 | 339.58397 | 3226 | 743 | 712 | 0.522 | 80 | 0.112360 | 191 | 0.268258 | 441 | 0.619382 | 1157 | 247 | 479 | 431 | 0.213483 | 1894 | 122 | 21 | 1348 | 156 | 247 | 122 | 21 | 1348 | 42 | 247 | 0 | 114 | 0 | 1051 | 663 | 180 | 0.554910 | 2.187971 | 9.499859 | 5.577413 | 0.359263 | 0.061840 | 3.969563 | 0.459386 | 0.727361 | 0.359263 | 0.061840 | 3.969563 | 0.123681 | 0.727361 | 0.000000 | 0.335705 | 0.000000 | 7.765384 | 0.0 | 2.660112 | 0.583986 | 0.583986 | 0.551339 | 0.475063 | 0.708537 | 0.825457 | 0.800664 | 0.535121 | 0.739844 | 0.758440 | 0.610876 | 6.776505e-01 | 16.931275 | 2.424792 | 1.102102 | 13.404381 | 88.229565 | 3.700439 | 3.700439 | 1.850219 | 103.612290 | 14.801756 | 653.13 | 0.0 | 0.0 | -99999.00 | -99999.000000 | -99999.000000 | 7803.0 | 9168.0 | 19967.0 | 12233.0 | 0.001746 | 0.194196 | 0.002093 | 0.250205 | 0.008100 | 0.559723 | 0.004415 | 0.307138 | 16.0 | 7.0 | 13.0 | 14.0 | 12.833333 | Salinas, CA Metro Area | 684 | 1157 | 3177 | 1942.16 | 1326.0 | 141 | 1710 | 247 | 479 | 431 | 0.554910 | 0.350053 | 0.095037 | 342 | 1.159385 | 1.883641 | 1.231041 | 2.667532 | 0 | 0 | 0 | 10 | 3.257603 | -0.024294 | 0.035214 | -0.328330 | -0.051495 | -0.375423 | -0.275634 | 0.001028 | 2.336001 | 0.054826 | 0.066514 | -0.187630 | 0.090756 | -0.257335 | -0.000358 | -0.206061 | -0.206061 | 0.103747 | -0.177292 | -0.198867 | -0.115549 | -0.000988 | 127155 | 433410 | 168080 | 289744.40 | 249520 | 0.299342 | 0.509912 | 0.218557 | 0.341141 | 0.440284 | 0.316341 | 3.597743 | 11.637535 | 15.235278 | 9.558585 | 44.831893 | 19.442039 | 13.574633 | 3529.404532 | 83.906547 | 5061.505510 | 1.400314e+06 | MULTIPOLYGON (((-2223551.553 1784238.168, -222... |
| 1 | 060530111022 | 060530111022 | 06 | 053 | 011102 | 2 | None | None | 41500 | Salinas, CA | 433212.0 | 171278.0 | 168077.0 | 358.705927 | 0.000000 | 358.705927 | 351.93580 | 4144 | 1035 | 940 | 0.543 | 14 | 0.014894 | 206 | 0.219149 | 720 | 0.765957 | 2253 | 434 | 790 | 1029 | 0.192632 | 412 | 60 | 96 | 68 | 181 | 7 | 60 | 38 | 68 | 37 | 7 | 28 | 116 | 58 | 127 | 148 | 137 | 0.308252 | 2.940877 | 11.774875 | 1.170668 | 0.170486 | 0.272777 | 0.193217 | 0.514298 | 0.019890 | 0.170486 | 0.107974 | 0.193217 | 0.105133 | 0.019890 | 0.079560 | 0.329606 | 0.164803 | 4.111545 | 0.0 | 0.438298 | 0.837517 | 0.837517 | 0.913236 | 0.913236 | 0.572388 | 0.583169 | 0.626348 | 0.704283 | 0.180860 | 0.309193 | 5.468447 | 1.146511e-02 | 24.214972 | 1.575386 | 2.802820 | 19.836766 | 143.381182 | 0.000000 | 14.273531 | 10.705148 | 139.166923 | 30.331252 | 1090.33 | 0.0 | 0.0 | -99999.00 | -99999.000000 | -99999.000000 | 8017.0 | 9431.0 | 21001.0 | 13293.0 | 0.001794 | 0.199522 | 0.002153 | 0.257382 | 0.008519 | 0.588709 | 0.004798 | 0.333752 | 12.0 | 20.0 | 17.0 | 13.0 | 15.333333 | Salinas, CA Metro Area | 919 | 2253 | 4079 | 2604.80 | 2376.0 | 352 | 2130 | 434 | 790 | 1029 | 0.308252 | 0.359223 | 0.332524 | 342 | 1.371403 | 0.775035 | 0.946000 | 3.036719 | 0 | 0 | 0 | 10 | 3.257603 | -0.024294 | 0.035214 | -0.328330 | -0.051495 | -0.375423 | -0.275634 | 0.001028 | 2.336001 | 0.054826 | 0.066514 | -0.187630 | 0.090756 | -0.257335 | -0.000358 | -0.206061 | -0.206061 | 0.103747 | -0.177292 | -0.198867 | -0.115549 | -0.000988 | 127155 | 433410 | 168080 | 289744.40 | 249520 | 0.299342 | 0.509912 | 0.218557 | 0.341141 | 0.440284 | 0.316341 | 3.987867 | 10.315683 | 14.303550 | 9.558585 | 44.831893 | 19.442039 | 12.744463 | 3313.560384 | 86.548001 | 7248.395767 | 1.451665e+06 | MULTIPOLYGON (((-2224166.846 1785356.591, -222... |
| 2 | 060971513084 | 060971513084 | 06 | 097 | 151308 | 4 | 488 | San Jose-San Francisco-Oakland, CA | 42220 | Santa Rosa-Petaluma, CA | 501317.0 | 201244.0 | 219300.0 | 67.135587 | 0.000000 | 67.135587 | 60.29173 | 396 | 169 | 131 | 0.490 | 0 | 0.000000 | 10 | 0.076336 | 121 | 0.923664 | 276 | 46 | 78 | 152 | 0.166667 | 30 | 4 | 0 | 16 | 10 | 0 | 4 | 0 | 16 | 9 | 0 | 0 | 1 | 0 | 6 | 9 | 15 | 0.200000 | 2.803038 | 6.568065 | 0.497581 | 0.066344 | 0.000000 | 0.265376 | 0.165860 | 0.000000 | 0.066344 | 0.000000 | 0.265376 | 0.149274 | 0.000000 | 0.000000 | 0.016586 | 0.000000 | 3.300619 | 0.0 | 0.229008 | 0.883037 | 0.602767 | 0.777957 | 0.518638 | 0.477267 | 0.397559 | 0.395591 | 0.095258 | 0.140845 | 0.196078 | 9.200000 | 2.746536e-04 | 22.454792 | 0.000000 | 0.064477 | 22.390315 | 149.409879 | 0.000000 | 0.000000 | 0.000000 | 181.125994 | 28.598841 | 615.57 | 0.0 | 0.0 | 0.67 | 6.387075 | 0.001692 | 20817.0 | 23238.0 | 3423.0 | 4467.0 | 0.002432 | 0.470739 | 0.002547 | 0.504417 | 0.001056 | 0.058551 | 0.001686 | 0.098464 | 9.0 | 9.0 | 17.0 | 15.0 | 13.666667 | Santa Rosa-Petaluma, CA Metro Area | 160 | 276 | 464 | 273.68 | NaN | 324 | 244 | 46 | 78 | 152 | 0.200000 | 0.300000 | 0.500000 | 342 | 1.335800 | 0.403851 | 0.000000 | 3.152322 | 0 | 0 | 0 | 8 | 0.119722 | 0.113615 | -0.174782 | -0.245813 | 0.099881 | -0.238419 | -0.654242 | 0.012110 | 1.963416 | 0.085305 | 0.132144 | -0.164703 | 0.082742 | -0.174723 | 0.000804 | -0.180891 | -0.180891 | 0.037875 | -0.170452 | -0.234661 | -0.235263 | -0.002019 | 189374 | 499772 | 219301 | 358126.56 | 355082 | 0.632332 | 0.488313 | 0.209589 | 0.301544 | 0.488862 | 0.016077 | 4.586229 | 36.685037 | 41.271266 | 18.791349 | 72.049792 | 39.609060 | 36.772698 | 9560.901540 | 57.790885 | 2700.963896 | 2.716912e+05 | MULTIPOLYGON (((-2281091.02 2025181.953, -2281... |
| 3 | 060510001022 | 060510001022 | 06 | 051 | 000102 | 2 | None | None | None | 0.0 | 0.0 | 0.0 | 107095.367112 | 49.323878 | 107046.043234 | 6959.85118 | 935 | 508 | 327 | 0.451 | 24 | 0.073394 | 106 | 0.324159 | 197 | 0.602446 | 316 | 82 | 113 | 121 | 0.259494 | 26 | 12 | 2 | 6 | 5 | 1 | 12 | 2 | 6 | 0 | 1 | 0 | 5 | 0 | 11 | 9 | 6 | 0.423077 | 0.072990 | 0.134342 | 0.003736 | 0.001724 | 0.000287 | 0.000862 | 0.000718 | 0.000144 | 0.001724 | 0.000287 | 0.000862 | 0.000000 | 0.000144 | 0.000000 | 0.000718 | 0.000000 | 0.076726 | 0.0 | 0.079511 | 0.829425 | 0.829425 | 0.829425 | 0.641955 | 0.201652 | 0.271701 | 0.290852 | 0.001513 | 0.000000 | 0.000000 | 12.153846 | 1.432010e-05 | 0.954596 | 0.109177 | 0.106369 | 0.739051 | 0.450593 | 0.089681 | 0.125553 | 0.017936 | 0.460363 | 0.041851 | 590.09 | 0.0 | 0.0 | -99999.00 | -99999.000000 | -99999.000000 | 63.0 | 384.0 | 26.0 | 316.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 13.0 | 1.0 | 15.0 | 8.000000 | Mono County | 309 | 316 | 928 | 680.24 | NaN | 671 | 376 | 82 | 113 | 121 | 0.423077 | 0.346154 | 0.230769 | 342 | 0.070449 | 0.003729 | 0.103618 | 0.553339 | 0 | 0 | 0 | 0 | 4.962820 | 0.287264 | 0.138923 | -0.468211 | 0.442641 | -0.521250 | 0.068571 | -0.007097 | 2.192546 | 0.054818 | 0.060232 | -0.169290 | 0.076734 | -0.199936 | -0.000238 | -0.184867 | -0.184867 | 0.079063 | -0.155833 | -0.373872 | -0.121008 | -0.001369 | 4765 | 14310 | 5234 | 10736.00 | 5073 | 0.650454 | 0.533473 | 0.357853 | 0.317730 | 0.324417 | 1.188458 | 6.658247 | 7.491844 | 14.150092 | 10.231154 | 33.306664 | 17.789688 | 12.607732 | 3278.010232 | 83.016896 | 126451.935344 | 4.334090e+08 | MULTIPOLYGON (((-2025810.921 1952822.52, -2025... | |
| 4 | 060971513092 | 060971513092 | 06 | 097 | 151309 | 2 | 488 | San Jose-San Francisco-Oakland, CA | 42220 | Santa Rosa-Petaluma, CA | 501317.0 | 201244.0 | 219300.0 | 570.037214 | 0.000000 | 570.037214 | 471.33595 | 2560 | 896 | 869 | 0.593 | 8 | 0.009206 | 200 | 0.230150 | 661 | 0.760644 | 1406 | 289 | 358 | 759 | 0.205548 | 63 | 1 | 0 | 24 | 36 | 2 | 1 | 0 | 24 | 13 | 2 | 5 | 18 | 0 | 20 | 19 | 24 | 0.317460 | 1.900980 | 5.431370 | 0.133663 | 0.002122 | 0.000000 | 0.050919 | 0.076379 | 0.004243 | 0.002122 | 0.000000 | 0.050919 | 0.027581 | 0.004243 | 0.010608 | 0.038189 | 0.000000 | 2.034642 | 0.0 | 0.072497 | 0.622319 | 0.536037 | 0.796767 | 0.686537 | 0.189938 | 0.234089 | 0.244020 | 0.001481 | 0.048037 | 0.085773 | 22.317460 | 5.520070e-10 | 9.245028 | 1.434447 | 1.089662 | 6.720919 | 42.302361 | 4.490935 | 1.122734 | 2.245467 | 37.050213 | 14.595538 | 742.98 | 0.0 | 0.0 | 0.67 | 0.752232 | 0.000262 | 23024.0 | 26141.0 | 265.0 | 2111.0 | 0.002689 | 0.520646 | 0.002865 | 0.567431 | 0.000082 | 0.004533 | 0.000797 | 0.046532 | 3.0 | 15.0 | 9.0 | 14.0 | 10.666667 | Santa Rosa-Petaluma, CA Metro Area | 966 | 1406 | 2882 | 2041.60 | 2169.0 | 1853 | 1286 | 289 | 358 | 759 | 0.317460 | 0.301587 | 0.380952 | 342 | 1.065048 | 0.125454 | 0.889720 | 2.043933 | 0 | 0 | 1 | 8 | 0.119722 | 0.113615 | -0.174782 | -0.245813 | 0.099881 | -0.238419 | -0.654242 | 0.012110 | 1.963416 | 0.085305 | 0.132144 | -0.164703 | 0.082742 | -0.174723 | 0.000804 | -0.180891 | -0.180891 | 0.037875 | -0.170452 | -0.234661 | -0.235263 | -0.002019 | 189374 | 499772 | 219301 | 358126.56 | 355082 | 0.632332 | 0.488313 | 0.209589 | 0.301544 | 0.488862 | 0.016077 | 6.306880 | 45.213616 | 51.520496 | 18.791349 | 72.049792 | 39.609060 | 45.904762 | 11935.238176 | 38.546556 | 7055.254272 | 2.306910e+06 | MULTIPOLYGON (((-2279605.043 2026679.113, -227... |
california_data.shape
(23212, 182)
Assign counties to outcome variable¶
# Get county names given coutny_fips codes
county_info = pd.read_csv('C:/Users/rdn91/OneDrive/Desktop/Coding/Python_Projects/california_counties/data/California_Counties.csv')
county_info['CNTY_FIPS'] = county_info['CNTY_FIPS'].astype("str")
county_info['CNTY_FIPS'] = '0' + county_info['CNTY_FIPS']
county_info = county_info.rename(columns = {'CNTY_FIPS' : 'COUNTYFP', 'NAME' : 'COUNTY_NAME'})
county_info.drop(['OBJECTID', "STATE_NAME", "STATE_FIPS", "FIPS", "Shape__Area", "Shape__Length"], axis=1, inplace=True)
county_info.head()
| COUNTY_NAME | COUNTYFP | |
|---|---|---|
| 0 | Alameda County | 01 |
| 1 | Alpine County | 03 |
| 2 | Amador County | 05 |
| 3 | Butte County | 07 |
| 4 | Calaveras County | 09 |
# Join California data and county_info
california_data = pd.merge(california_data, county_info, how='inner', on = "COUNTYFP")
california_data.head()
| GEOID10 | GEOID20 | STATEFP | COUNTYFP | TRACTCE | BLKGRPCE | CSA | CSA_Name | CBSA | CBSA_Name | CBSA_POP | CBSA_EMP | CBSA_WRK | Ac_Total | Ac_Water | Ac_Land | Ac_Unpr | TotPop | CountHU | HH | P_WrkAge | AutoOwn0 | Pct_AO0 | AutoOwn1 | Pct_AO1 | AutoOwn2p | Pct_AO2p | Workers | R_LowWageWk | R_MedWageWk | R_HiWageWk | R_PCTLOWWAGE | TotEmp | E5_Ret | E5_Off | E5_Ind | E5_Svc | E5_Ent | E8_Ret | E8_off | E8_Ind | E8_Svc | E8_Ent | E8_Ed | E8_Hlth | E8_Pub | E_LowWageWk | E_MedWageWk | E_HiWageWk | E_PctLowWage | D1A | D1B | D1C | D1C5_RET | D1C5_OFF | D1C5_IND | D1C5_SVC | D1C5_ENT | D1C8_RET | D1C8_OFF | D1C8_IND | D1C8_SVC | D1C8_ENT | D1C8_ED | D1C8_HLTH | D1C8_PUB | D1D | D1_FLAG | D2A_JPHH | D2B_E5MIX | D2B_E5MIXA | D2B_E8MIX | D2B_E8MIXA | D2A_EPHHM | D2C_TRPMX1 | D2C_TRPMX2 | D2C_TRIPEQ | D2R_JOBPOP | D2R_WRKEMP | D2A_WRKEMP | D2C_WREMLX | D3A | D3AAO | D3AMM | D3APO | D3B | D3BAO | D3BMM3 | D3BMM4 | D3BPO3 | D3BPO4 | D4A | D4B025 | D4B050 | D4C | D4D | D4E | D5AR | D5AE | D5BR | D5BE | D5CR | D5CRI | D5CE | D5CEI | D5DR | D5DRI | D5DE | D5DEI | D2A_Ranked | D2B_Ranked | D3B_Ranked | D4A_Ranked | NatWalkInd | Region | Households | Workers_1 | Residents | Drivers | Vehicles | White | Male | Lowwage | Medwage | Highwage | W_P_Lowwage | W_P_Medwage | W_P_Highwage | GasPrice | logd1a | logd1c | logd3aao | logd3apo | d4bo25 | d5dei_1 | logd4d | UPTpercap | B_C_constant | B_C_male | B_C_ld1c | B_C_drvmveh | B_C_ld1a | B_C_ld3apo | B_C_inc1 | B_C_gasp | B_N_constant | B_N_inc2 | B_N_inc3 | B_N_white | B_N_male | B_N_drvmveh | B_N_gasp | B_N_ld1a | B_N_ld1c | B_N_ld3aao | B_N_ld3apo | B_N_d4bo25 | B_N_d5dei | B_N_UPTpc | C_R_Households | C_R_Pop | C_R_Workers | C_R_Drivers | C_R_Vehicles | C_R_White | C_R_Male | C_R_Lowwage | C_R_Medwage | C_R_Highwage | C_R_DrmV | NonCom_VMT_Per_Worker | Com_VMT_Per_Worker | VMT_per_worker | VMT_tot_min | VMT_tot_max | VMT_tot_avg | GHG_per_worker | Annual_GHG | SLC_score | Shape_Length | Shape_Area | geometry | COUNTY_NAME | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 060530111013 | 060530111013 | 06 | 053 | 011101 | 3 | None | None | 41500 | Salinas, CA | 433212.0 | 171278.0 | 168077.0 | 346.017589 | 0.112678 | 345.904912 | 339.58397 | 3226 | 743 | 712 | 0.522 | 80 | 0.112360 | 191 | 0.268258 | 441 | 0.619382 | 1157 | 247 | 479 | 431 | 0.213483 | 1894 | 122 | 21 | 1348 | 156 | 247 | 122 | 21 | 1348 | 42 | 247 | 0 | 114 | 0 | 1051 | 663 | 180 | 0.554910 | 2.187971 | 9.499859 | 5.577413 | 0.359263 | 0.061840 | 3.969563 | 0.459386 | 0.727361 | 0.359263 | 0.061840 | 3.969563 | 0.123681 | 0.727361 | 0.000000 | 0.335705 | 0.000000 | 7.765384 | 0.0 | 2.660112 | 0.583986 | 0.583986 | 0.551339 | 0.475063 | 0.708537 | 0.825457 | 0.800664 | 0.535121 | 0.739844 | 0.758440 | 0.610876 | 6.776505e-01 | 16.931275 | 2.424792 | 1.102102 | 13.404381 | 88.229565 | 3.700439 | 3.700439 | 1.850219 | 103.612290 | 14.801756 | 653.13 | 0.0 | 0.0 | -99999.00 | -99999.000000 | -99999.000000 | 7803.0 | 9168.0 | 19967.0 | 12233.0 | 0.001746 | 0.194196 | 0.002093 | 0.250205 | 0.008100 | 0.559723 | 0.004415 | 0.307138 | 16.0 | 7.0 | 13.0 | 14.0 | 12.833333 | Salinas, CA Metro Area | 684 | 1157 | 3177 | 1942.16 | 1326.0 | 141 | 1710 | 247 | 479 | 431 | 0.554910 | 0.350053 | 0.095037 | 342 | 1.159385 | 1.883641 | 1.231041 | 2.667532 | 0 | 0 | 0 | 10 | 3.257603 | -0.024294 | 0.035214 | -0.328330 | -0.051495 | -0.375423 | -0.275634 | 0.001028 | 2.336001 | 0.054826 | 0.066514 | -0.187630 | 0.090756 | -0.257335 | -0.000358 | -0.206061 | -0.206061 | 0.103747 | -0.177292 | -0.198867 | -0.115549 | -0.000988 | 127155 | 433410 | 168080 | 289744.40 | 249520 | 0.299342 | 0.509912 | 0.218557 | 0.341141 | 0.440284 | 0.316341 | 3.597743 | 11.637535 | 15.235278 | 9.558585 | 44.831893 | 19.442039 | 13.574633 | 3529.404532 | 83.906547 | 5061.505510 | 1.400314e+06 | MULTIPOLYGON (((-2223551.553 1784238.168, -222... | Monterey County |
| 1 | 060530111022 | 060530111022 | 06 | 053 | 011102 | 2 | None | None | 41500 | Salinas, CA | 433212.0 | 171278.0 | 168077.0 | 358.705927 | 0.000000 | 358.705927 | 351.93580 | 4144 | 1035 | 940 | 0.543 | 14 | 0.014894 | 206 | 0.219149 | 720 | 0.765957 | 2253 | 434 | 790 | 1029 | 0.192632 | 412 | 60 | 96 | 68 | 181 | 7 | 60 | 38 | 68 | 37 | 7 | 28 | 116 | 58 | 127 | 148 | 137 | 0.308252 | 2.940877 | 11.774875 | 1.170668 | 0.170486 | 0.272777 | 0.193217 | 0.514298 | 0.019890 | 0.170486 | 0.107974 | 0.193217 | 0.105133 | 0.019890 | 0.079560 | 0.329606 | 0.164803 | 4.111545 | 0.0 | 0.438298 | 0.837517 | 0.837517 | 0.913236 | 0.913236 | 0.572388 | 0.583169 | 0.626348 | 0.704283 | 0.180860 | 0.309193 | 5.468447 | 1.146511e-02 | 24.214972 | 1.575386 | 2.802820 | 19.836766 | 143.381182 | 0.000000 | 14.273531 | 10.705148 | 139.166923 | 30.331252 | 1090.33 | 0.0 | 0.0 | -99999.00 | -99999.000000 | -99999.000000 | 8017.0 | 9431.0 | 21001.0 | 13293.0 | 0.001794 | 0.199522 | 0.002153 | 0.257382 | 0.008519 | 0.588709 | 0.004798 | 0.333752 | 12.0 | 20.0 | 17.0 | 13.0 | 15.333333 | Salinas, CA Metro Area | 919 | 2253 | 4079 | 2604.80 | 2376.0 | 352 | 2130 | 434 | 790 | 1029 | 0.308252 | 0.359223 | 0.332524 | 342 | 1.371403 | 0.775035 | 0.946000 | 3.036719 | 0 | 0 | 0 | 10 | 3.257603 | -0.024294 | 0.035214 | -0.328330 | -0.051495 | -0.375423 | -0.275634 | 0.001028 | 2.336001 | 0.054826 | 0.066514 | -0.187630 | 0.090756 | -0.257335 | -0.000358 | -0.206061 | -0.206061 | 0.103747 | -0.177292 | -0.198867 | -0.115549 | -0.000988 | 127155 | 433410 | 168080 | 289744.40 | 249520 | 0.299342 | 0.509912 | 0.218557 | 0.341141 | 0.440284 | 0.316341 | 3.987867 | 10.315683 | 14.303550 | 9.558585 | 44.831893 | 19.442039 | 12.744463 | 3313.560384 | 86.548001 | 7248.395767 | 1.451665e+06 | MULTIPOLYGON (((-2224166.846 1785356.591, -222... | Monterey County |
| 2 | 060971513084 | 060971513084 | 06 | 097 | 151308 | 4 | 488 | San Jose-San Francisco-Oakland, CA | 42220 | Santa Rosa-Petaluma, CA | 501317.0 | 201244.0 | 219300.0 | 67.135587 | 0.000000 | 67.135587 | 60.29173 | 396 | 169 | 131 | 0.490 | 0 | 0.000000 | 10 | 0.076336 | 121 | 0.923664 | 276 | 46 | 78 | 152 | 0.166667 | 30 | 4 | 0 | 16 | 10 | 0 | 4 | 0 | 16 | 9 | 0 | 0 | 1 | 0 | 6 | 9 | 15 | 0.200000 | 2.803038 | 6.568065 | 0.497581 | 0.066344 | 0.000000 | 0.265376 | 0.165860 | 0.000000 | 0.066344 | 0.000000 | 0.265376 | 0.149274 | 0.000000 | 0.000000 | 0.016586 | 0.000000 | 3.300619 | 0.0 | 0.229008 | 0.883037 | 0.602767 | 0.777957 | 0.518638 | 0.477267 | 0.397559 | 0.395591 | 0.095258 | 0.140845 | 0.196078 | 9.200000 | 2.746536e-04 | 22.454792 | 0.000000 | 0.064477 | 22.390315 | 149.409879 | 0.000000 | 0.000000 | 0.000000 | 181.125994 | 28.598841 | 615.57 | 0.0 | 0.0 | 0.67 | 6.387075 | 0.001692 | 20817.0 | 23238.0 | 3423.0 | 4467.0 | 0.002432 | 0.470739 | 0.002547 | 0.504417 | 0.001056 | 0.058551 | 0.001686 | 0.098464 | 9.0 | 9.0 | 17.0 | 15.0 | 13.666667 | Santa Rosa-Petaluma, CA Metro Area | 160 | 276 | 464 | 273.68 | NaN | 324 | 244 | 46 | 78 | 152 | 0.200000 | 0.300000 | 0.500000 | 342 | 1.335800 | 0.403851 | 0.000000 | 3.152322 | 0 | 0 | 0 | 8 | 0.119722 | 0.113615 | -0.174782 | -0.245813 | 0.099881 | -0.238419 | -0.654242 | 0.012110 | 1.963416 | 0.085305 | 0.132144 | -0.164703 | 0.082742 | -0.174723 | 0.000804 | -0.180891 | -0.180891 | 0.037875 | -0.170452 | -0.234661 | -0.235263 | -0.002019 | 189374 | 499772 | 219301 | 358126.56 | 355082 | 0.632332 | 0.488313 | 0.209589 | 0.301544 | 0.488862 | 0.016077 | 4.586229 | 36.685037 | 41.271266 | 18.791349 | 72.049792 | 39.609060 | 36.772698 | 9560.901540 | 57.790885 | 2700.963896 | 2.716912e+05 | MULTIPOLYGON (((-2281091.02 2025181.953, -2281... | Sonoma County |
| 3 | 060510001022 | 060510001022 | 06 | 051 | 000102 | 2 | None | None | None | 0.0 | 0.0 | 0.0 | 107095.367112 | 49.323878 | 107046.043234 | 6959.85118 | 935 | 508 | 327 | 0.451 | 24 | 0.073394 | 106 | 0.324159 | 197 | 0.602446 | 316 | 82 | 113 | 121 | 0.259494 | 26 | 12 | 2 | 6 | 5 | 1 | 12 | 2 | 6 | 0 | 1 | 0 | 5 | 0 | 11 | 9 | 6 | 0.423077 | 0.072990 | 0.134342 | 0.003736 | 0.001724 | 0.000287 | 0.000862 | 0.000718 | 0.000144 | 0.001724 | 0.000287 | 0.000862 | 0.000000 | 0.000144 | 0.000000 | 0.000718 | 0.000000 | 0.076726 | 0.0 | 0.079511 | 0.829425 | 0.829425 | 0.829425 | 0.641955 | 0.201652 | 0.271701 | 0.290852 | 0.001513 | 0.000000 | 0.000000 | 12.153846 | 1.432010e-05 | 0.954596 | 0.109177 | 0.106369 | 0.739051 | 0.450593 | 0.089681 | 0.125553 | 0.017936 | 0.460363 | 0.041851 | 590.09 | 0.0 | 0.0 | -99999.00 | -99999.000000 | -99999.000000 | 63.0 | 384.0 | 26.0 | 316.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 13.0 | 1.0 | 15.0 | 8.000000 | Mono County | 309 | 316 | 928 | 680.24 | NaN | 671 | 376 | 82 | 113 | 121 | 0.423077 | 0.346154 | 0.230769 | 342 | 0.070449 | 0.003729 | 0.103618 | 0.553339 | 0 | 0 | 0 | 0 | 4.962820 | 0.287264 | 0.138923 | -0.468211 | 0.442641 | -0.521250 | 0.068571 | -0.007097 | 2.192546 | 0.054818 | 0.060232 | -0.169290 | 0.076734 | -0.199936 | -0.000238 | -0.184867 | -0.184867 | 0.079063 | -0.155833 | -0.373872 | -0.121008 | -0.001369 | 4765 | 14310 | 5234 | 10736.00 | 5073 | 0.650454 | 0.533473 | 0.357853 | 0.317730 | 0.324417 | 1.188458 | 6.658247 | 7.491844 | 14.150092 | 10.231154 | 33.306664 | 17.789688 | 12.607732 | 3278.010232 | 83.016896 | 126451.935344 | 4.334090e+08 | MULTIPOLYGON (((-2025810.921 1952822.52, -2025... | Mono County | |
| 4 | 060971513092 | 060971513092 | 06 | 097 | 151309 | 2 | 488 | San Jose-San Francisco-Oakland, CA | 42220 | Santa Rosa-Petaluma, CA | 501317.0 | 201244.0 | 219300.0 | 570.037214 | 0.000000 | 570.037214 | 471.33595 | 2560 | 896 | 869 | 0.593 | 8 | 0.009206 | 200 | 0.230150 | 661 | 0.760644 | 1406 | 289 | 358 | 759 | 0.205548 | 63 | 1 | 0 | 24 | 36 | 2 | 1 | 0 | 24 | 13 | 2 | 5 | 18 | 0 | 20 | 19 | 24 | 0.317460 | 1.900980 | 5.431370 | 0.133663 | 0.002122 | 0.000000 | 0.050919 | 0.076379 | 0.004243 | 0.002122 | 0.000000 | 0.050919 | 0.027581 | 0.004243 | 0.010608 | 0.038189 | 0.000000 | 2.034642 | 0.0 | 0.072497 | 0.622319 | 0.536037 | 0.796767 | 0.686537 | 0.189938 | 0.234089 | 0.244020 | 0.001481 | 0.048037 | 0.085773 | 22.317460 | 5.520070e-10 | 9.245028 | 1.434447 | 1.089662 | 6.720919 | 42.302361 | 4.490935 | 1.122734 | 2.245467 | 37.050213 | 14.595538 | 742.98 | 0.0 | 0.0 | 0.67 | 0.752232 | 0.000262 | 23024.0 | 26141.0 | 265.0 | 2111.0 | 0.002689 | 0.520646 | 0.002865 | 0.567431 | 0.000082 | 0.004533 | 0.000797 | 0.046532 | 3.0 | 15.0 | 9.0 | 14.0 | 10.666667 | Santa Rosa-Petaluma, CA Metro Area | 966 | 1406 | 2882 | 2041.60 | 2169.0 | 1853 | 1286 | 289 | 358 | 759 | 0.317460 | 0.301587 | 0.380952 | 342 | 1.065048 | 0.125454 | 0.889720 | 2.043933 | 0 | 0 | 1 | 8 | 0.119722 | 0.113615 | -0.174782 | -0.245813 | 0.099881 | -0.238419 | -0.654242 | 0.012110 | 1.963416 | 0.085305 | 0.132144 | -0.164703 | 0.082742 | -0.174723 | 0.000804 | -0.180891 | -0.180891 | 0.037875 | -0.170452 | -0.234661 | -0.235263 | -0.002019 | 189374 | 499772 | 219301 | 358126.56 | 355082 | 0.632332 | 0.488313 | 0.209589 | 0.301544 | 0.488862 | 0.016077 | 6.306880 | 45.213616 | 51.520496 | 18.791349 | 72.049792 | 39.609060 | 45.904762 | 11935.238176 | 38.546556 | 7055.254272 | 2.306910e+06 | MULTIPOLYGON (((-2279605.043 2026679.113, -227... | Sonoma County |
# Number of census blocks for each county
print(len(california_data['COUNTY_NAME'].value_counts()))
california_data['COUNTY_NAME'].value_counts()
45
COUNTY_NAME Los Angeles County 6425 Orange County 1823 San Diego County 1795 San Bernardino County 1092 Santa Clara County 1075 Riverside County 1030 Sacramento County 912 Contra Costa County 637 Fresno County 589 San Francisco County 581 San Mateo County 463 Kern County 456 San Joaquin County 395 Sonoma County 387 Santa Barbara County 313 Stanislaus County 308 Solano County 285 Monterey County 233 Placer County 213 Santa Cruz County 196 Marin County 175 San Luis Obispo County 163 Merced County 144 Shasta County 131 El Dorado County 125 Humboldt County 108 Napa County 106 Imperial County 96 Kings County 81 Madera County 80 Mendocino County 79 Nevada County 75 Lake County 48 San Benito County 40 Siskiyou County 37 Lassen County 25 Glenn County 23 Plumas County 22 Colusa County 20 Del Norte County 20 Mono County 17 Mariposa County 17 Inyo County 16 Modoc County 12 Sierra County 5 Name: count, dtype: int64
# Create target labels
california_data['target'] = 'Unknown County'
california_data.loc[california_data['COUNTY_NAME'].isin(['Butte County', 'Colusa County', 'Del Norte County',
'Glenn County', 'Humboldt County', 'Lake County',
'Lassen County', 'Mendocino County', 'Modoc County',
'Nevada County', 'Plumas County', 'Shasta County',
'Sierra County', 'Siskiyou County', 'Tehama County',
'Trinity County']), 'target'] = 'Northern California'
california_data.loc[california_data['COUNTY_NAME'].isin(['Alameda County', 'Contra Costa County', 'Marin County',
'Monterey County', 'Napa County', 'San Benito County',
'San Francisco County', 'San Mateo County', 'Santa Clara County',
'Santa Cruz County', 'Solano County', 'Sonoma County'
]), 'target'] = 'Bay Area'
california_data.loc[california_data['COUNTY_NAME'].isin(['Imperial County', 'Kern County', 'Los Angeles County',
'Orange County', 'Riverside County', 'San Bernardino County',
'San Diego County', 'San Luis Obispo County', 'Santa Barbara County',
'Ventura County', 'Tri-City County']), 'target'] = 'Southern California'
california_data.loc[california_data['COUNTY_NAME'].isin(['Alpine County', 'Amador County', 'Calaveras County',
'El Dorado County', 'Fresno County', 'Inyo County',
'Kings County', 'Madera County', 'Mariposa County',
'Merced County', 'Mono County', 'Placer County',
'Sacramento County', 'San Joaquin County', 'Stanislaus County',
'Sutter County', 'Yuba County', 'Tulare County',
'Tuolumne County', 'Yolo County']), 'target'] = 'Central California'
print(len(california_data['target'].value_counts()))
california_data['target'].value_counts()
4
target Southern California 13193 Bay Area 4178 Central California 2897 Northern California 605 Name: count, dtype: int64
california_data.shape
(20873, 184)
Missing Values/Outliers/Errors¶
# See which columns have nulls and how many
null_cols = california_data.columns[california_data.isna().any()]
null_counts = california_data[null_cols].isna().sum()
print(null_counts)
CSA 3505 CSA_Name 3505 CBSA_Name 169 D1C8_OFF 20 Vehicles 2266 dtype: int64
sns.histplot(california_data, x= 'Ac_Land', log_scale=True)
<AxesSubplot: xlabel='Ac_Land', ylabel='Count'>
sns.histplot(california_data, )
Investigate Vehicles column¶
california_data['Vehicles'].value_counts()
Vehicles
762.0 30
815.0 28
898.0 27
747.0 26
951.0 26
..
3638.0 1
2735.0 1
4953.0 1
3593.0 1
235.0 1
Name: count, Length: 2778, dtype: int64
null_vehicle_rows = california_data[california_data['Vehicles'].isna()]
(null_vehicle_rows['COUNTY_NAME'].value_counts() / california_data['COUNTY_NAME'].value_counts()).sort_values(ascending=False)
COUNTY_NAME Mono County 0.470588 Sierra County 0.400000 El Dorado County 0.184000 Imperial County 0.156250 Madera County 0.150000 Del Norte County 0.150000 Contra Costa County 0.147567 Plumas County 0.136364 Placer County 0.131455 Glenn County 0.130435 Kern County 0.127193 Marin County 0.125714 Orange County 0.125617 San Bernardino County 0.122711 Los Angeles County 0.120623 Nevada County 0.120000 Santa Barbara County 0.118211 Mariposa County 0.117647 San Mateo County 0.107991 Sonoma County 0.105943 Lake County 0.104167 San Diego County 0.103621 Santa Clara County 0.103256 Humboldt County 0.101852 Santa Cruz County 0.096939 Fresno County 0.095076 Monterey County 0.094421 San Francisco County 0.087780 Riverside County 0.084466 Modoc County 0.083333 Lassen County 0.080000 Napa County 0.075472 Kings County 0.074074 Sacramento County 0.070175 Solano County 0.070175 Inyo County 0.062500 Stanislaus County 0.061688 San Luis Obispo County 0.061350 Merced County 0.055556 San Joaquin County 0.050633 Colusa County 0.050000 Shasta County 0.045802 Mendocino County 0.037975 San Benito County 0.025000 Siskiyou County NaN Name: count, dtype: float64
null_amount = null_vehicle_rows[california_data['COUNTY_NAME'].isin(['Mono County', 'Sierra County'])]
null_amount['COUNTY_NAME'].value_counts()
c:\Users\rdn91\AppData\Local\Programs\Python\Python311\Lib\site-packages\geopandas\geodataframe.py:1750: UserWarning: Boolean Series key will be reindexed to match DataFrame index. result = super().__getitem__(key)
COUNTY_NAME Mono County 8 Sierra County 2 Name: count, dtype: int64
original_amount = california_data[california_data['COUNTY_NAME'].isin(['Mono County', 'Sierra County'])]
original_amount['COUNTY_NAME'].value_counts()
COUNTY_NAME Mono County 17 Sierra County 5 Name: count, dtype: int64
mono_sierra = california_data[california_data['COUNTY_NAME'].isin(['Mono County', 'Sierra County'])]
other_counties = california_data[california_data['COUNTY_NAME'].isin(['Orange County'])]
x_var = "Vehicles"
fig, ax = plt.subplots(2,2, figsize=(20,10))
sns.histplot(data=mono_sierra, x=x_var, hue="COUNTY_NAME", ax =ax[0, 0]).set_title("Histogram of Vehicles")
sns.kdeplot(data=mono_sierra, x=x_var, hue="COUNTY_NAME", ax=ax[0,1]).set_title("KDE plot of Vehicles")
sns.histplot(data=other_counties, x=x_var, ax =ax[1, 0]).set_title("Histogram of Vehicles for all other counties")
sns.kdeplot(data=other_counties, x=x_var, ax=ax[1,1]).set_title("KDE plot of Vehicles for all other counties")
plt.subplots_adjust(hspace=0.3)
After reading the supporting data dictionaries, there is not enough information to understand what the "Vehicles" column means. Also, since the counties don't have that much data, the "discreteness" can be quite an issue later on. Therefore, I will drop it.
Investigate D1C8_OFF column¶
california_data['D1C8_OFF'].value_counts().sort_values()
D1C8_OFF
0.064529 1
1.710945 1
0.108574 1
0.013567 1
0.667934 1
...
0.170265 2
0.172473 2
0.625374 2
0.061783 2
0.000000 5946
Name: count, Length: 14902, dtype: int64
sns.kdeplot(data=california_data, x="D1C8_OFF")
<AxesSubplot: xlabel='D1C8_OFF', ylabel='Density'>
null_dlc_rows = california_data[california_data['D1C8_OFF'].isna()]
(null_dlc_rows['COUNTY_NAME'].value_counts() / california_data['COUNTY_NAME'].value_counts()).sort_values(ascending=False)
COUNTY_NAME Del Norte County 0.050000 Mendocino County 0.012658 Humboldt County 0.009259 El Dorado County 0.008000 San Luis Obispo County 0.006135 Marin County 0.005714 Santa Cruz County 0.005102 Placer County 0.004695 Monterey County 0.004292 Santa Barbara County 0.003195 Sonoma County 0.002584 San Mateo County 0.002160 San Francisco County 0.001721 Contra Costa County 0.001570 Los Angeles County 0.000623 San Diego County 0.000557 Orange County 0.000549 Colusa County NaN Fresno County NaN Glenn County NaN Imperial County NaN Inyo County NaN Kern County NaN Kings County NaN Lake County NaN Lassen County NaN Madera County NaN Mariposa County NaN Merced County NaN Modoc County NaN Mono County NaN Napa County NaN Nevada County NaN Plumas County NaN Riverside County NaN Sacramento County NaN San Benito County NaN San Bernardino County NaN San Joaquin County NaN Santa Clara County NaN Shasta County NaN Sierra County NaN Siskiyou County NaN Solano County NaN Stanislaus County NaN Name: count, dtype: float64
# Let's look at the distribution for each county that has a missing value for dlc8_off
null_dlc_counties = null_dlc_rows['COUNTY_NAME'].values.tolist()
dlc_cols = null_dlc_rows.columns.tolist()
dlc_cols = [element for element in dlc_cols if element.startswith("D1C")]
dlc_cols
['D1C', 'D1C5_RET', 'D1C5_OFF', 'D1C5_IND', 'D1C5_SVC', 'D1C5_ENT', 'D1C8_RET', 'D1C8_OFF', 'D1C8_IND', 'D1C8_SVC', 'D1C8_ENT', 'D1C8_ED', 'D1C8_HLTH', 'D1C8_PUB']
# Find all columns that have weird values (-9999 or 0)
negative_cols = california_data.columns[california_data.isin([-99999.0]).any()].tolist()
zero_val_cols = california_data.columns[california_data.isin([0]).any()].tolist()
print(negative_cols)
print(zero_val_cols)
['D4A', 'D4C', 'D4D', 'D4E', 'D5BR', 'D5BE', 'D5DR', 'D5DRI', 'D5DE', 'D5DEI'] ['CBSA_POP', 'CBSA_EMP', 'CBSA_WRK', 'Ac_Water', 'Ac_Land', 'Ac_Unpr', 'TotPop', 'CountHU', 'HH', 'P_WrkAge', 'AutoOwn0', 'Pct_AO0', 'AutoOwn1', 'Pct_AO1', 'AutoOwn2p', 'Pct_AO2p', 'Workers', 'R_LowWageWk', 'R_MedWageWk', 'R_HiWageWk', 'R_PCTLOWWAGE', 'TotEmp', 'E5_Ret', 'E5_Off', 'E5_Ind', 'E5_Svc', 'E5_Ent', 'E8_Ret', 'E8_off', 'E8_Ind', 'E8_Svc', 'E8_Ent', 'E8_Ed', 'E8_Hlth', 'E8_Pub', 'E_LowWageWk', 'E_MedWageWk', 'E_HiWageWk', 'E_PctLowWage', 'D1A', 'D1B', 'D1C', 'D1C5_RET', 'D1C5_OFF', 'D1C5_IND', 'D1C5_SVC', 'D1C5_ENT', 'D1C8_RET', 'D1C8_OFF', 'D1C8_IND', 'D1C8_SVC', 'D1C8_ENT', 'D1C8_ED', 'D1C8_HLTH', 'D1C8_PUB', 'D1D', 'D1_FLAG', 'D2A_JPHH', 'D2B_E5MIX', 'D2B_E5MIXA', 'D2B_E8MIX', 'D2B_E8MIXA', 'D2A_EPHHM', 'D2C_TRPMX1', 'D2C_TRPMX2', 'D2C_TRIPEQ', 'D2R_JOBPOP', 'D2R_WRKEMP', 'D2A_WRKEMP', 'D2C_WREMLX', 'D3A', 'D3AAO', 'D3AMM', 'D3APO', 'D3B', 'D3BAO', 'D3BMM3', 'D3BMM4', 'D3BPO3', 'D3BPO4', 'D4A', 'D4B025', 'D4B050', 'D4E', 'D5AR', 'D5AE', 'D5BR', 'D5CR', 'D5CRI', 'D5CE', 'D5CEI', 'D5DR', 'D5DRI', 'D5DE', 'D5DEI', 'Households', 'Residents', 'Drivers', 'White', 'Male', 'Lowwage', 'Medwage', 'Highwage', 'W_P_Lowwage', 'W_P_Medwage', 'W_P_Highwage', 'logd1a', 'logd1c', 'logd3aao', 'logd3apo', 'd4bo25', 'd5dei_1', 'logd4d', 'UPTpercap', 'B_N_UPTpc', 'SLC_score']
plot_cols(california_data, dlc_cols, 4, "histplot")
All the D1C columns have very similar distributions and are the only columns that seem to have "-99999" value.
row_length = []
for column in negative_cols:
temp = null_dlc_rows[null_dlc_rows[column] == -99999.0]
rows, columns = temp.shape
row_length.append(rows)
if len(set(row_length)) == 1:
print("All of the D1C8_OFF columns that have a -99999 values also has the same value for other D1C columns.")
else:
print("Not all D1C columns match one to one in terms of rows with -99999 values.")
print(row_length)
california_data[california_data['D1C8_OFF'].isna()].shape
All of the D1C8_OFF columns that have a -99999 values also has the same value for other D1C columns. [20, 20, 20, 20, 20, 20, 20, 20, 20, 20]
(20, 184)
# Is it the same for the original data?
row_length = []
for column in negative_cols:
temp = california_data[california_data[column] == -99999.0]
rows, columns = temp.shape
row_length.append(rows)
if len(set(row_length)) == 1:
print("All of the D1C8_OFF columns that have a -99999 values also has the same value for other D1C columns.")
else:
print("Not all D1C columns match one to one in terms of rows with -99999 values.")
print(row_length)
Not all D1C columns match one to one in terms of rows with -99999 values. [5289, 5931, 5931, 5931, 5289, 5289, 5289, 5289, 5289, 5289]
delete_negative_val = california_data[(california_data != -99999.0).all(axis=1)]
delete_negative_val.shape
(13471, 184)
delete_negative_val.columns[delete_negative_val.isin([-99999.0]).any()].tolist()
[]
delete_negative_val.columns[delete_negative_val.isin([-99999.0]).any()].tolist()
[]
california_data = delete_negative_val.dropna(subset=negative_cols, how='any')
california_data.shape
(13471, 184)
Check nulls and weird values again¶
california_data[(california_data == -99999).any(axis=1)]
| GEOID10 | GEOID20 | STATEFP | COUNTYFP | TRACTCE | BLKGRPCE | CSA | CSA_Name | CBSA | CBSA_Name | CBSA_POP | CBSA_EMP | CBSA_WRK | Ac_Total | Ac_Water | Ac_Land | Ac_Unpr | TotPop | CountHU | HH | P_WrkAge | AutoOwn0 | Pct_AO0 | AutoOwn1 | Pct_AO1 | AutoOwn2p | Pct_AO2p | Workers | R_LowWageWk | R_MedWageWk | R_HiWageWk | R_PCTLOWWAGE | TotEmp | E5_Ret | E5_Off | E5_Ind | E5_Svc | E5_Ent | E8_Ret | E8_off | E8_Ind | E8_Svc | E8_Ent | E8_Ed | E8_Hlth | E8_Pub | E_LowWageWk | E_MedWageWk | E_HiWageWk | E_PctLowWage | D1A | D1B | D1C | D1C5_RET | D1C5_OFF | D1C5_IND | D1C5_SVC | D1C5_ENT | D1C8_RET | D1C8_OFF | D1C8_IND | D1C8_SVC | D1C8_ENT | D1C8_ED | D1C8_HLTH | D1C8_PUB | D1D | D1_FLAG | D2A_JPHH | D2B_E5MIX | D2B_E5MIXA | D2B_E8MIX | D2B_E8MIXA | D2A_EPHHM | D2C_TRPMX1 | D2C_TRPMX2 | D2C_TRIPEQ | D2R_JOBPOP | D2R_WRKEMP | D2A_WRKEMP | D2C_WREMLX | D3A | D3AAO | D3AMM | D3APO | D3B | D3BAO | D3BMM3 | D3BMM4 | D3BPO3 | D3BPO4 | D4A | D4B025 | D4B050 | D4C | D4D | D4E | D5AR | D5AE | D5BR | D5BE | D5CR | D5CRI | D5CE | D5CEI | D5DR | D5DRI | D5DE | D5DEI | D2A_Ranked | D2B_Ranked | D3B_Ranked | D4A_Ranked | NatWalkInd | Region | Households | Workers_1 | Residents | Drivers | Vehicles | White | Male | Lowwage | Medwage | Highwage | W_P_Lowwage | W_P_Medwage | W_P_Highwage | GasPrice | logd1a | logd1c | logd3aao | logd3apo | d4bo25 | d5dei_1 | logd4d | UPTpercap | B_C_constant | B_C_male | B_C_ld1c | B_C_drvmveh | B_C_ld1a | B_C_ld3apo | B_C_inc1 | B_C_gasp | B_N_constant | B_N_inc2 | B_N_inc3 | B_N_white | B_N_male | B_N_drvmveh | B_N_gasp | B_N_ld1a | B_N_ld1c | B_N_ld3aao | B_N_ld3apo | B_N_d4bo25 | B_N_d5dei | B_N_UPTpc | C_R_Households | C_R_Pop | C_R_Workers | C_R_Drivers | C_R_Vehicles | C_R_White | C_R_Male | C_R_Lowwage | C_R_Medwage | C_R_Highwage | C_R_DrmV | NonCom_VMT_Per_Worker | Com_VMT_Per_Worker | VMT_per_worker | VMT_tot_min | VMT_tot_max | VMT_tot_avg | GHG_per_worker | Annual_GHG | SLC_score | Shape_Length | Shape_Area | geometry | COUNTY_NAME | target |
|---|
numeric_cols = california_data.select_dtypes(include=['float64', 'int64']).columns.tolist()
california_data[numeric_cols].columns[(california_data[numeric_cols] < 0 ).all()] # all the columns that have only negative values (it's not an error)
Index(['B_N_white', 'B_N_drvmveh', 'B_N_ld1a', 'B_N_ld1c', 'B_N_ld3apo',
'B_N_d4bo25'],
dtype='object')
california_data[numeric_cols].columns[(california_data[numeric_cols] < 0 ).any()] # all the columns that contain at least one negative value
Index(['B_C_male', 'B_C_ld1c', 'B_C_drvmveh', 'B_C_ld1a', 'B_C_ld3apo',
'B_C_inc1', 'B_C_gasp', 'B_N_inc2', 'B_N_inc3', 'B_N_white',
'B_N_drvmveh', 'B_N_gasp', 'B_N_ld1a', 'B_N_ld1c', 'B_N_ld3apo',
'B_N_d4bo25', 'B_N_d5dei', 'B_N_UPTpc', 'C_R_DrmV'],
dtype='object')
california_data['B_C_male'].value_counts() # check some of those columns to confirm
B_C_male
0.011455 87
0.113615 81
-0.030399 59
0.092642 57
0.075932 55
..
-0.120785 1
0.210880 1
0.196435 1
0.146326 1
-0.029296 1
Name: count, Length: 2252, dtype: int64
california_data.columns[california_data.isna().any()]
Index(['CSA', 'CSA_Name', 'CBSA_Name', 'Vehicles'], dtype='object')
# Drop the two null numerical columns
#california_data.drop(['Vehicles'], axis=1, inplace=True)
california_data.head()
| GEOID10 | GEOID20 | STATEFP | COUNTYFP | TRACTCE | BLKGRPCE | CSA | CSA_Name | CBSA | CBSA_Name | CBSA_POP | CBSA_EMP | CBSA_WRK | Ac_Total | Ac_Water | Ac_Land | Ac_Unpr | TotPop | CountHU | HH | P_WrkAge | AutoOwn0 | Pct_AO0 | AutoOwn1 | Pct_AO1 | AutoOwn2p | Pct_AO2p | Workers | R_LowWageWk | R_MedWageWk | R_HiWageWk | R_PCTLOWWAGE | TotEmp | E5_Ret | E5_Off | E5_Ind | E5_Svc | E5_Ent | E8_Ret | E8_off | E8_Ind | E8_Svc | E8_Ent | E8_Ed | E8_Hlth | E8_Pub | E_LowWageWk | E_MedWageWk | E_HiWageWk | E_PctLowWage | D1A | D1B | D1C | D1C5_RET | D1C5_OFF | D1C5_IND | D1C5_SVC | D1C5_ENT | D1C8_RET | D1C8_OFF | D1C8_IND | D1C8_SVC | D1C8_ENT | D1C8_ED | D1C8_HLTH | D1C8_PUB | D1D | D1_FLAG | D2A_JPHH | D2B_E5MIX | D2B_E5MIXA | D2B_E8MIX | D2B_E8MIXA | D2A_EPHHM | D2C_TRPMX1 | D2C_TRPMX2 | D2C_TRIPEQ | D2R_JOBPOP | D2R_WRKEMP | D2A_WRKEMP | D2C_WREMLX | D3A | D3AAO | D3AMM | D3APO | D3B | D3BAO | D3BMM3 | D3BMM4 | D3BPO3 | D3BPO4 | D4A | D4B025 | D4B050 | D4C | D4D | D4E | D5AR | D5AE | D5BR | D5BE | D5CR | D5CRI | D5CE | D5CEI | D5DR | D5DRI | D5DE | D5DEI | D2A_Ranked | D2B_Ranked | D3B_Ranked | D4A_Ranked | NatWalkInd | Region | Households | Workers_1 | Residents | Drivers | Vehicles | White | Male | Lowwage | Medwage | Highwage | W_P_Lowwage | W_P_Medwage | W_P_Highwage | GasPrice | logd1a | logd1c | logd3aao | logd3apo | d4bo25 | d5dei_1 | logd4d | UPTpercap | B_C_constant | B_C_male | B_C_ld1c | B_C_drvmveh | B_C_ld1a | B_C_ld3apo | B_C_inc1 | B_C_gasp | B_N_constant | B_N_inc2 | B_N_inc3 | B_N_white | B_N_male | B_N_drvmveh | B_N_gasp | B_N_ld1a | B_N_ld1c | B_N_ld3aao | B_N_ld3apo | B_N_d4bo25 | B_N_d5dei | B_N_UPTpc | C_R_Households | C_R_Pop | C_R_Workers | C_R_Drivers | C_R_Vehicles | C_R_White | C_R_Male | C_R_Lowwage | C_R_Medwage | C_R_Highwage | C_R_DrmV | NonCom_VMT_Per_Worker | Com_VMT_Per_Worker | VMT_per_worker | VMT_tot_min | VMT_tot_max | VMT_tot_avg | GHG_per_worker | Annual_GHG | SLC_score | Shape_Length | Shape_Area | geometry | COUNTY_NAME | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 060971513084 | 060971513084 | 06 | 097 | 151308 | 4 | 488 | San Jose-San Francisco-Oakland, CA | 42220 | Santa Rosa-Petaluma, CA | 501317.0 | 201244.0 | 219300.0 | 67.135587 | 0.0 | 67.135587 | 60.291730 | 396 | 169 | 131 | 0.490 | 0 | 0.000000 | 10 | 0.076336 | 121 | 0.923664 | 276 | 46 | 78 | 152 | 0.166667 | 30 | 4 | 0 | 16 | 10 | 0 | 4 | 0 | 16 | 9 | 0 | 0 | 1 | 0 | 6 | 9 | 15 | 0.200000 | 2.803038 | 6.568065 | 0.497581 | 0.066344 | 0.000000 | 0.265376 | 0.165860 | 0.000000 | 0.066344 | 0.000000 | 0.265376 | 0.149274 | 0.000000 | 0.000000 | 0.016586 | 0.0 | 3.300619 | 0.0 | 0.229008 | 0.883037 | 0.602767 | 0.777957 | 0.518638 | 0.477267 | 0.397559 | 0.395591 | 9.525833e-02 | 0.140845 | 0.196078 | 9.200000 | 2.746536e-04 | 22.454792 | 0.000000 | 0.064477 | 22.390315 | 149.409879 | 0.000000 | 0.000000 | 0.000000 | 181.125994 | 28.598841 | 615.57 | 0.00000 | 0.000000 | 0.67 | 6.387075 | 0.001692 | 20817.0 | 23238.0 | 3423.0 | 4467.0 | 0.002432 | 0.470739 | 0.002547 | 0.504417 | 0.001056 | 0.058551 | 0.001686 | 0.098464 | 9.0 | 9.0 | 17.0 | 15.0 | 13.666667 | Santa Rosa-Petaluma, CA Metro Area | 160 | 276 | 464 | 273.68 | NaN | 324 | 244 | 46 | 78 | 152 | 0.200000 | 0.300000 | 0.500000 | 342 | 1.335800 | 0.403851 | 0.00000 | 3.152322 | 0 | 0 | 0 | 8 | 0.119722 | 0.113615 | -0.174782 | -0.245813 | 0.099881 | -0.238419 | -0.654242 | 0.01211 | 1.963416 | 0.085305 | 0.132144 | -0.164703 | 0.082742 | -0.174723 | 0.000804 | -0.180891 | -0.180891 | 0.037875 | -0.170452 | -0.234661 | -0.235263 | -0.002019 | 189374 | 499772 | 219301 | 358126.56 | 355082 | 0.632332 | 0.488313 | 0.209589 | 0.301544 | 0.488862 | 0.016077 | 4.586229 | 36.685037 | 41.271266 | 18.791349 | 72.049792 | 39.60906 | 36.772698 | 9560.901540 | 57.790885 | 2700.963896 | 2.716912e+05 | MULTIPOLYGON (((-2281091.02 2025181.953, -2281... | Sonoma County | Bay Area |
| 4 | 060971513092 | 060971513092 | 06 | 097 | 151309 | 2 | 488 | San Jose-San Francisco-Oakland, CA | 42220 | Santa Rosa-Petaluma, CA | 501317.0 | 201244.0 | 219300.0 | 570.037214 | 0.0 | 570.037214 | 471.335950 | 2560 | 896 | 869 | 0.593 | 8 | 0.009206 | 200 | 0.230150 | 661 | 0.760644 | 1406 | 289 | 358 | 759 | 0.205548 | 63 | 1 | 0 | 24 | 36 | 2 | 1 | 0 | 24 | 13 | 2 | 5 | 18 | 0 | 20 | 19 | 24 | 0.317460 | 1.900980 | 5.431370 | 0.133663 | 0.002122 | 0.000000 | 0.050919 | 0.076379 | 0.004243 | 0.002122 | 0.000000 | 0.050919 | 0.027581 | 0.004243 | 0.010608 | 0.038189 | 0.0 | 2.034642 | 0.0 | 0.072497 | 0.622319 | 0.536037 | 0.796767 | 0.686537 | 0.189938 | 0.234089 | 0.244020 | 1.481149e-03 | 0.048037 | 0.085773 | 22.317460 | 5.520070e-10 | 9.245028 | 1.434447 | 1.089662 | 6.720919 | 42.302361 | 4.490935 | 1.122734 | 2.245467 | 37.050213 | 14.595538 | 742.98 | 0.00000 | 0.000000 | 0.67 | 0.752232 | 0.000262 | 23024.0 | 26141.0 | 265.0 | 2111.0 | 0.002689 | 0.520646 | 0.002865 | 0.567431 | 0.000082 | 0.004533 | 0.000797 | 0.046532 | 3.0 | 15.0 | 9.0 | 14.0 | 10.666667 | Santa Rosa-Petaluma, CA Metro Area | 966 | 1406 | 2882 | 2041.60 | 2169.0 | 1853 | 1286 | 289 | 358 | 759 | 0.317460 | 0.301587 | 0.380952 | 342 | 1.065048 | 0.125454 | 0.88972 | 2.043933 | 0 | 0 | 1 | 8 | 0.119722 | 0.113615 | -0.174782 | -0.245813 | 0.099881 | -0.238419 | -0.654242 | 0.01211 | 1.963416 | 0.085305 | 0.132144 | -0.164703 | 0.082742 | -0.174723 | 0.000804 | -0.180891 | -0.180891 | 0.037875 | -0.170452 | -0.234661 | -0.235263 | -0.002019 | 189374 | 499772 | 219301 | 358126.56 | 355082 | 0.632332 | 0.488313 | 0.209589 | 0.301544 | 0.488862 | 0.016077 | 6.306880 | 45.213616 | 51.520496 | 18.791349 | 72.049792 | 39.60906 | 45.904762 | 11935.238176 | 38.546556 | 7055.254272 | 2.306910e+06 | MULTIPOLYGON (((-2279605.043 2026679.113, -227... | Sonoma County | Bay Area |
| 5 | 060971513093 | 060971513093 | 06 | 097 | 151309 | 3 | 488 | San Jose-San Francisco-Oakland, CA | 42220 | Santa Rosa-Petaluma, CA | 501317.0 | 201244.0 | 219300.0 | 982.997406 | 0.0 | 982.997406 | 982.846330 | 1610 | 712 | 648 | 0.574 | 77 | 0.118827 | 187 | 0.288580 | 384 | 0.592593 | 665 | 149 | 171 | 345 | 0.224060 | 282 | 0 | 0 | 31 | 245 | 6 | 0 | 0 | 31 | 5 | 6 | 61 | 179 | 0 | 83 | 104 | 95 | 0.294326 | 0.724427 | 1.638099 | 0.286922 | 0.000000 | 0.000000 | 0.031541 | 0.249276 | 0.006105 | 0.000000 | 0.000000 | 0.031541 | 0.005087 | 0.006105 | 0.062065 | 0.182124 | 0.0 | 1.011348 | 0.0 | 0.435185 | 0.406720 | 0.277630 | 0.631164 | 0.488506 | 0.540334 | 0.441643 | 0.477021 | 9.053520e-01 | 0.298097 | 0.595565 | 2.358156 | 2.571345e-01 | 6.367728 | 0.795925 | 0.849893 | 4.721909 | 21.711227 | 1.302140 | 4.557489 | 0.000000 | 22.136376 | 3.906419 | 1098.38 | 0.00000 | 0.000000 | 0.67 | 0.436217 | 0.000416 | 22544.0 | 25770.0 | 368.0 | 1478.0 | 0.002633 | 0.509792 | 0.002824 | 0.559378 | 0.000114 | 0.006295 | 0.000558 | 0.032579 | 11.0 | 8.0 | 7.0 | 13.0 | 9.833333 | Santa Rosa-Petaluma, CA Metro Area | 660 | 665 | 1722 | 1156.32 | 1084.0 | 1283 | 647 | 149 | 171 | 345 | 0.294326 | 0.368794 | 0.336879 | 342 | 0.544895 | 0.252253 | 0.58552 | 1.744303 | 0 | 0 | 0 | 8 | 0.119722 | 0.113615 | -0.174782 | -0.245813 | 0.099881 | -0.238419 | -0.654242 | 0.01211 | 1.963416 | 0.085305 | 0.132144 | -0.164703 | 0.082742 | -0.174723 | 0.000804 | -0.180891 | -0.180891 | 0.037875 | -0.170452 | -0.234661 | -0.235263 | -0.002019 | 189374 | 499772 | 219301 | 358126.56 | 355082 | 0.632332 | 0.488313 | 0.209589 | 0.301544 | 0.488862 | 0.016077 | 7.067722 | 45.780375 | 52.848097 | 18.791349 | 72.049792 | 39.60906 | 47.087654 | 12242.790059 | 36.053805 | 8233.425925 | 3.978132e+06 | MULTIPOLYGON (((-2279501.685 2025691.2, -22794... | Sonoma County | Bay Area |
| 6 | 060971513101 | 060971513101 | 06 | 097 | 151310 | 1 | 488 | San Jose-San Francisco-Oakland, CA | 42220 | Santa Rosa-Petaluma, CA | 501317.0 | 201244.0 | 219300.0 | 51.480814 | 0.0 | 51.480814 | 51.080456 | 506 | 199 | 182 | 0.709 | 0 | 0.000000 | 35 | 0.192308 | 147 | 0.807692 | 268 | 63 | 73 | 132 | 0.235075 | 142 | 14 | 0 | 8 | 19 | 101 | 14 | 0 | 8 | 13 | 101 | 0 | 6 | 0 | 96 | 37 | 9 | 0.676056 | 3.895815 | 9.905941 | 2.779928 | 0.274077 | 0.000000 | 0.156616 | 0.371962 | 1.977273 | 0.274077 | 0.000000 | 0.156616 | 0.254500 | 1.977273 | 0.000000 | 0.117462 | 0.0 | 6.675743 | 0.0 | 0.780220 | 0.650603 | 0.560399 | 0.612247 | 0.473864 | 0.671536 | 0.565885 | 0.621614 | 5.384277e-01 | 0.438272 | 0.692683 | 1.887324 | 4.117562e-01 | 27.510900 | 0.000000 | 9.572344 | 17.938556 | 116.063433 | 0.000000 | 12.431816 | 24.863632 | 87.022711 | 24.863632 | 580.70 | 0.00000 | 0.479764 | 2.00 | 24.863632 | 0.003953 | 21597.0 | 25470.0 | 185.0 | 1195.0 | 0.002523 | 0.488377 | 0.002791 | 0.552866 | 0.000057 | 0.003164 | 0.000451 | 0.026341 | 15.0 | 7.0 | 15.0 | 15.0 | 13.666667 | Santa Rosa-Petaluma, CA Metro Area | 200 | 268 | 552 | 383.68 | 365.0 | 317 | 190 | 63 | 73 | 132 | 0.676056 | 0.260563 | 0.063380 | 342 | 1.588381 | 1.329705 | 0.00000 | 2.941200 | 0 | 0 | 3 | 8 | 0.119722 | 0.113615 | -0.174782 | -0.245813 | 0.099881 | -0.238419 | -0.654242 | 0.01211 | 1.963416 | 0.085305 | 0.132144 | -0.164703 | 0.082742 | -0.174723 | 0.000804 | -0.180891 | -0.180891 | 0.037875 | -0.170452 | -0.234661 | -0.235263 | -0.002019 | 189374 | 499772 | 219301 | 358126.56 | 355082 | 0.632332 | 0.488313 | 0.209589 | 0.301544 | 0.488862 | 0.016077 | 3.675845 | 24.646966 | 28.322812 | 18.791349 | 72.049792 | 39.60906 | 25.235625 | 6561.262585 | 82.103377 | 1863.752846 | 2.083395e+05 | MULTIPOLYGON (((-2280677.682 2023214.453, -228... | Sonoma County | Bay Area |
| 7 | 060971513102 | 060971513102 | 06 | 097 | 151310 | 2 | 488 | San Jose-San Francisco-Oakland, CA | 42220 | Santa Rosa-Petaluma, CA | 501317.0 | 201244.0 | 219300.0 | 94.734433 | 0.0 | 94.734433 | 89.936999 | 1944 | 700 | 674 | 0.653 | 68 | 0.100890 | 97 | 0.143917 | 509 | 0.755193 | 502 | 93 | 166 | 243 | 0.185259 | 18 | 1 | 1 | 3 | 13 | 0 | 1 | 1 | 3 | 1 | 0 | 0 | 12 | 0 | 11 | 5 | 2 | 0.611111 | 7.783226 | 21.615131 | 0.200140 | 0.011119 | 0.011119 | 0.033357 | 0.144546 | 0.000000 | 0.011119 | 0.011119 | 0.033357 | 0.011119 | 0.000000 | 0.000000 | 0.133427 | 0.0 | 7.983366 | 0.0 | 0.026706 | 0.616612 | 0.531121 | 0.652815 | 0.505263 | 0.088743 | 0.129022 | 0.139861 | 1.035771e-07 | 0.018349 | 0.069231 | 27.888889 | 2.100409e-12 | 21.003141 | 0.000000 | 4.051075 | 16.952065 | 92.364304 | 0.000000 | 6.755727 | 13.511455 | 101.335910 | 6.755727 | 692.02 | 0.15082 | 0.597189 | 2.00 | 13.511455 | 0.001029 | 21512.0 | 25326.0 | 162.0 | 494.0 | 0.002513 | 0.486455 | 0.002775 | 0.549741 | 0.000050 | 0.002771 | 0.000186 | 0.010889 | 1.0 | 8.0 | 14.0 | 14.0 | 10.833333 | Santa Rosa-Petaluma, CA Metro Area | 636 | 502 | 1741 | 1295.36 | 1306.0 | 1217 | 915 | 93 | 166 | 243 | 0.611111 | 0.277778 | 0.111111 | 342 | 2.172844 | 0.182438 | 0.00000 | 2.887705 | 0 | 0 | 3 | 8 | 0.119722 | 0.113615 | -0.174782 | -0.245813 | 0.099881 | -0.238419 | -0.654242 | 0.01211 | 1.963416 | 0.085305 | 0.132144 | -0.164703 | 0.082742 | -0.174723 | 0.000804 | -0.180891 | -0.180891 | 0.037875 | -0.170452 | -0.234661 | -0.235263 | -0.002019 | 189374 | 499772 | 219301 | 358126.56 | 355082 | 0.632332 | 0.488313 | 0.209589 | 0.301544 | 0.488862 | 0.016077 | 4.009743 | 33.743783 | 37.753526 | 18.791349 | 72.049792 | 39.60906 | 33.638392 | 8745.981842 | 64.395923 | 2853.102115 | 3.833834e+05 | MULTIPOLYGON (((-2281109.446 2023606.289, -228... | Sonoma County | Bay Area |
print(california_data.shape)
(13471, 184)
Data Transofrmation¶
Work with CSV format for Modeling¶
#california_data.to_csv('C:/Users/rdn91/OneDrive/Desktop/Coding/Python_Projects/california_counties/data/cal_data_delete.csv', index=False)
model_data = pd.read_csv('C:/Users/rdn91/OneDrive/Desktop/Coding/Python_Projects/california_counties/data/cal_data_delete.csv')
model_data.drop(['Region', 'Shape_Length', 'Shape_Area', 'geometry'], axis=1, inplace=True)
print(model_data.shape)
model_data.head()
(13471, 179)
| GEOID10 | GEOID20 | STATEFP | COUNTYFP | TRACTCE | BLKGRPCE | CSA | CSA_Name | CBSA | CBSA_Name | CBSA_POP | CBSA_EMP | CBSA_WRK | Ac_Total | Ac_Water | Ac_Land | Ac_Unpr | TotPop | CountHU | HH | P_WrkAge | AutoOwn0 | Pct_AO0 | AutoOwn1 | Pct_AO1 | AutoOwn2p | Pct_AO2p | Workers | R_LowWageWk | R_MedWageWk | R_HiWageWk | R_PCTLOWWAGE | TotEmp | E5_Ret | E5_Off | E5_Ind | E5_Svc | E5_Ent | E8_Ret | E8_off | E8_Ind | E8_Svc | E8_Ent | E8_Ed | E8_Hlth | E8_Pub | E_LowWageWk | E_MedWageWk | E_HiWageWk | E_PctLowWage | D1A | D1B | D1C | D1C5_RET | D1C5_OFF | D1C5_IND | D1C5_SVC | D1C5_ENT | D1C8_RET | D1C8_OFF | D1C8_IND | D1C8_SVC | D1C8_ENT | D1C8_ED | D1C8_HLTH | D1C8_PUB | D1D | D1_FLAG | D2A_JPHH | D2B_E5MIX | D2B_E5MIXA | D2B_E8MIX | D2B_E8MIXA | D2A_EPHHM | D2C_TRPMX1 | D2C_TRPMX2 | D2C_TRIPEQ | D2R_JOBPOP | D2R_WRKEMP | D2A_WRKEMP | D2C_WREMLX | D3A | D3AAO | D3AMM | D3APO | D3B | D3BAO | D3BMM3 | D3BMM4 | D3BPO3 | D3BPO4 | D4A | D4B025 | D4B050 | D4C | D4D | D4E | D5AR | D5AE | D5BR | D5BE | D5CR | D5CRI | D5CE | D5CEI | D5DR | D5DRI | D5DE | D5DEI | D2A_Ranked | D2B_Ranked | D3B_Ranked | D4A_Ranked | NatWalkInd | Households | Workers_1 | Residents | Drivers | White | Male | Lowwage | Medwage | Highwage | W_P_Lowwage | W_P_Medwage | W_P_Highwage | GasPrice | logd1a | logd1c | logd3aao | logd3apo | d4bo25 | d5dei_1 | logd4d | UPTpercap | B_C_constant | B_C_male | B_C_ld1c | B_C_drvmveh | B_C_ld1a | B_C_ld3apo | B_C_inc1 | B_C_gasp | B_N_constant | B_N_inc2 | B_N_inc3 | B_N_white | B_N_male | B_N_drvmveh | B_N_gasp | B_N_ld1a | B_N_ld1c | B_N_ld3aao | B_N_ld3apo | B_N_d4bo25 | B_N_d5dei | B_N_UPTpc | C_R_Households | C_R_Pop | C_R_Workers | C_R_Drivers | C_R_Vehicles | C_R_White | C_R_Male | C_R_Lowwage | C_R_Medwage | C_R_Highwage | C_R_DrmV | NonCom_VMT_Per_Worker | Com_VMT_Per_Worker | VMT_per_worker | VMT_tot_min | VMT_tot_max | VMT_tot_avg | GHG_per_worker | Annual_GHG | SLC_score | COUNTY_NAME | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 60971513084 | 60971513084 | 6 | 97 | 151308 | 4 | 488.0 | San Jose-San Francisco-Oakland, CA | 42220.0 | Santa Rosa-Petaluma, CA | 501317.0 | 201244.0 | 219300.0 | 67.135587 | 0.0 | 67.135587 | 60.291730 | 396 | 169 | 131 | 0.490 | 0 | 0.000000 | 10 | 0.076336 | 121 | 0.923664 | 276 | 46 | 78 | 152 | 0.166667 | 30 | 4 | 0 | 16 | 10 | 0 | 4 | 0 | 16 | 9 | 0 | 0 | 1 | 0 | 6 | 9 | 15 | 0.200000 | 2.803038 | 6.568065 | 0.497581 | 0.066344 | 0.000000 | 0.265376 | 0.165860 | 0.000000 | 0.066344 | 0.000000 | 0.265376 | 0.149274 | 0.000000 | 0.000000 | 0.016586 | 0.0 | 3.300619 | 0.0 | 0.229008 | 0.883037 | 0.602767 | 0.777957 | 0.518638 | 0.477267 | 0.397559 | 0.395591 | 9.525833e-02 | 0.140845 | 0.196078 | 9.200000 | 2.746536e-04 | 22.454792 | 0.000000 | 0.064477 | 22.390315 | 149.409879 | 0.000000 | 0.000000 | 0.000000 | 181.125994 | 28.598841 | 615.57 | 0.00000 | 0.000000 | 0.67 | 6.387075 | 0.001692 | 20817.0 | 23238.0 | 3423.0 | 4467.0 | 0.002432 | 0.470739 | 0.002547 | 0.504417 | 0.001056 | 0.058551 | 0.001686 | 0.098464 | 9.0 | 9.0 | 17.0 | 15.0 | 13.666667 | 160 | 276 | 464 | 273.68 | 324 | 244 | 46 | 78 | 152 | 0.200000 | 0.300000 | 0.500000 | 342 | 1.335800 | 0.403851 | 0.00000 | 3.152322 | 0 | 0 | 0 | 8 | 0.119722 | 0.113615 | -0.174782 | -0.245813 | 0.099881 | -0.238419 | -0.654242 | 0.01211 | 1.963416 | 0.085305 | 0.132144 | -0.164703 | 0.082742 | -0.174723 | 0.000804 | -0.180891 | -0.180891 | 0.037875 | -0.170452 | -0.234661 | -0.235263 | -0.002019 | 189374 | 499772 | 219301 | 358126.56 | 355082 | 0.632332 | 0.488313 | 0.209589 | 0.301544 | 0.488862 | 0.016077 | 4.586229 | 36.685037 | 41.271266 | 18.791349 | 72.049792 | 39.60906 | 36.772698 | 9560.901540 | 57.790885 | Sonoma County | Bay Area |
| 1 | 60971513092 | 60971513092 | 6 | 97 | 151309 | 2 | 488.0 | San Jose-San Francisco-Oakland, CA | 42220.0 | Santa Rosa-Petaluma, CA | 501317.0 | 201244.0 | 219300.0 | 570.037214 | 0.0 | 570.037214 | 471.335950 | 2560 | 896 | 869 | 0.593 | 8 | 0.009206 | 200 | 0.230150 | 661 | 0.760644 | 1406 | 289 | 358 | 759 | 0.205548 | 63 | 1 | 0 | 24 | 36 | 2 | 1 | 0 | 24 | 13 | 2 | 5 | 18 | 0 | 20 | 19 | 24 | 0.317460 | 1.900980 | 5.431370 | 0.133663 | 0.002122 | 0.000000 | 0.050919 | 0.076379 | 0.004243 | 0.002122 | 0.000000 | 0.050919 | 0.027581 | 0.004243 | 0.010608 | 0.038189 | 0.0 | 2.034642 | 0.0 | 0.072497 | 0.622319 | 0.536037 | 0.796767 | 0.686537 | 0.189938 | 0.234089 | 0.244020 | 1.481149e-03 | 0.048037 | 0.085773 | 22.317460 | 5.520070e-10 | 9.245028 | 1.434447 | 1.089662 | 6.720919 | 42.302361 | 4.490935 | 1.122734 | 2.245467 | 37.050213 | 14.595538 | 742.98 | 0.00000 | 0.000000 | 0.67 | 0.752232 | 0.000262 | 23024.0 | 26141.0 | 265.0 | 2111.0 | 0.002689 | 0.520646 | 0.002865 | 0.567431 | 0.000082 | 0.004533 | 0.000797 | 0.046532 | 3.0 | 15.0 | 9.0 | 14.0 | 10.666667 | 966 | 1406 | 2882 | 2041.60 | 1853 | 1286 | 289 | 358 | 759 | 0.317460 | 0.301587 | 0.380952 | 342 | 1.065048 | 0.125454 | 0.88972 | 2.043933 | 0 | 0 | 1 | 8 | 0.119722 | 0.113615 | -0.174782 | -0.245813 | 0.099881 | -0.238419 | -0.654242 | 0.01211 | 1.963416 | 0.085305 | 0.132144 | -0.164703 | 0.082742 | -0.174723 | 0.000804 | -0.180891 | -0.180891 | 0.037875 | -0.170452 | -0.234661 | -0.235263 | -0.002019 | 189374 | 499772 | 219301 | 358126.56 | 355082 | 0.632332 | 0.488313 | 0.209589 | 0.301544 | 0.488862 | 0.016077 | 6.306880 | 45.213616 | 51.520496 | 18.791349 | 72.049792 | 39.60906 | 45.904762 | 11935.238176 | 38.546556 | Sonoma County | Bay Area |
| 2 | 60971513093 | 60971513093 | 6 | 97 | 151309 | 3 | 488.0 | San Jose-San Francisco-Oakland, CA | 42220.0 | Santa Rosa-Petaluma, CA | 501317.0 | 201244.0 | 219300.0 | 982.997406 | 0.0 | 982.997406 | 982.846330 | 1610 | 712 | 648 | 0.574 | 77 | 0.118827 | 187 | 0.288580 | 384 | 0.592593 | 665 | 149 | 171 | 345 | 0.224060 | 282 | 0 | 0 | 31 | 245 | 6 | 0 | 0 | 31 | 5 | 6 | 61 | 179 | 0 | 83 | 104 | 95 | 0.294326 | 0.724427 | 1.638099 | 0.286922 | 0.000000 | 0.000000 | 0.031541 | 0.249276 | 0.006105 | 0.000000 | 0.000000 | 0.031541 | 0.005087 | 0.006105 | 0.062065 | 0.182124 | 0.0 | 1.011348 | 0.0 | 0.435185 | 0.406720 | 0.277630 | 0.631164 | 0.488506 | 0.540334 | 0.441643 | 0.477021 | 9.053520e-01 | 0.298097 | 0.595565 | 2.358156 | 2.571345e-01 | 6.367728 | 0.795925 | 0.849893 | 4.721909 | 21.711227 | 1.302140 | 4.557489 | 0.000000 | 22.136376 | 3.906419 | 1098.38 | 0.00000 | 0.000000 | 0.67 | 0.436217 | 0.000416 | 22544.0 | 25770.0 | 368.0 | 1478.0 | 0.002633 | 0.509792 | 0.002824 | 0.559378 | 0.000114 | 0.006295 | 0.000558 | 0.032579 | 11.0 | 8.0 | 7.0 | 13.0 | 9.833333 | 660 | 665 | 1722 | 1156.32 | 1283 | 647 | 149 | 171 | 345 | 0.294326 | 0.368794 | 0.336879 | 342 | 0.544895 | 0.252253 | 0.58552 | 1.744303 | 0 | 0 | 0 | 8 | 0.119722 | 0.113615 | -0.174782 | -0.245813 | 0.099881 | -0.238419 | -0.654242 | 0.01211 | 1.963416 | 0.085305 | 0.132144 | -0.164703 | 0.082742 | -0.174723 | 0.000804 | -0.180891 | -0.180891 | 0.037875 | -0.170452 | -0.234661 | -0.235263 | -0.002019 | 189374 | 499772 | 219301 | 358126.56 | 355082 | 0.632332 | 0.488313 | 0.209589 | 0.301544 | 0.488862 | 0.016077 | 7.067722 | 45.780375 | 52.848097 | 18.791349 | 72.049792 | 39.60906 | 47.087654 | 12242.790059 | 36.053805 | Sonoma County | Bay Area |
| 3 | 60971513101 | 60971513101 | 6 | 97 | 151310 | 1 | 488.0 | San Jose-San Francisco-Oakland, CA | 42220.0 | Santa Rosa-Petaluma, CA | 501317.0 | 201244.0 | 219300.0 | 51.480814 | 0.0 | 51.480814 | 51.080456 | 506 | 199 | 182 | 0.709 | 0 | 0.000000 | 35 | 0.192308 | 147 | 0.807692 | 268 | 63 | 73 | 132 | 0.235075 | 142 | 14 | 0 | 8 | 19 | 101 | 14 | 0 | 8 | 13 | 101 | 0 | 6 | 0 | 96 | 37 | 9 | 0.676056 | 3.895815 | 9.905941 | 2.779928 | 0.274077 | 0.000000 | 0.156616 | 0.371962 | 1.977273 | 0.274077 | 0.000000 | 0.156616 | 0.254500 | 1.977273 | 0.000000 | 0.117462 | 0.0 | 6.675743 | 0.0 | 0.780220 | 0.650603 | 0.560399 | 0.612247 | 0.473864 | 0.671536 | 0.565885 | 0.621614 | 5.384277e-01 | 0.438272 | 0.692683 | 1.887324 | 4.117562e-01 | 27.510900 | 0.000000 | 9.572344 | 17.938556 | 116.063433 | 0.000000 | 12.431816 | 24.863632 | 87.022711 | 24.863632 | 580.70 | 0.00000 | 0.479764 | 2.00 | 24.863632 | 0.003953 | 21597.0 | 25470.0 | 185.0 | 1195.0 | 0.002523 | 0.488377 | 0.002791 | 0.552866 | 0.000057 | 0.003164 | 0.000451 | 0.026341 | 15.0 | 7.0 | 15.0 | 15.0 | 13.666667 | 200 | 268 | 552 | 383.68 | 317 | 190 | 63 | 73 | 132 | 0.676056 | 0.260563 | 0.063380 | 342 | 1.588381 | 1.329705 | 0.00000 | 2.941200 | 0 | 0 | 3 | 8 | 0.119722 | 0.113615 | -0.174782 | -0.245813 | 0.099881 | -0.238419 | -0.654242 | 0.01211 | 1.963416 | 0.085305 | 0.132144 | -0.164703 | 0.082742 | -0.174723 | 0.000804 | -0.180891 | -0.180891 | 0.037875 | -0.170452 | -0.234661 | -0.235263 | -0.002019 | 189374 | 499772 | 219301 | 358126.56 | 355082 | 0.632332 | 0.488313 | 0.209589 | 0.301544 | 0.488862 | 0.016077 | 3.675845 | 24.646966 | 28.322812 | 18.791349 | 72.049792 | 39.60906 | 25.235625 | 6561.262585 | 82.103377 | Sonoma County | Bay Area |
| 4 | 60971513102 | 60971513102 | 6 | 97 | 151310 | 2 | 488.0 | San Jose-San Francisco-Oakland, CA | 42220.0 | Santa Rosa-Petaluma, CA | 501317.0 | 201244.0 | 219300.0 | 94.734433 | 0.0 | 94.734433 | 89.936999 | 1944 | 700 | 674 | 0.653 | 68 | 0.100890 | 97 | 0.143917 | 509 | 0.755193 | 502 | 93 | 166 | 243 | 0.185259 | 18 | 1 | 1 | 3 | 13 | 0 | 1 | 1 | 3 | 1 | 0 | 0 | 12 | 0 | 11 | 5 | 2 | 0.611111 | 7.783226 | 21.615131 | 0.200140 | 0.011119 | 0.011119 | 0.033357 | 0.144546 | 0.000000 | 0.011119 | 0.011119 | 0.033357 | 0.011119 | 0.000000 | 0.000000 | 0.133427 | 0.0 | 7.983366 | 0.0 | 0.026706 | 0.616612 | 0.531121 | 0.652815 | 0.505263 | 0.088743 | 0.129022 | 0.139861 | 1.035771e-07 | 0.018349 | 0.069231 | 27.888889 | 2.100409e-12 | 21.003141 | 0.000000 | 4.051075 | 16.952065 | 92.364304 | 0.000000 | 6.755727 | 13.511455 | 101.335910 | 6.755727 | 692.02 | 0.15082 | 0.597189 | 2.00 | 13.511455 | 0.001029 | 21512.0 | 25326.0 | 162.0 | 494.0 | 0.002513 | 0.486455 | 0.002775 | 0.549741 | 0.000050 | 0.002771 | 0.000186 | 0.010889 | 1.0 | 8.0 | 14.0 | 14.0 | 10.833333 | 636 | 502 | 1741 | 1295.36 | 1217 | 915 | 93 | 166 | 243 | 0.611111 | 0.277778 | 0.111111 | 342 | 2.172844 | 0.182438 | 0.00000 | 2.887705 | 0 | 0 | 3 | 8 | 0.119722 | 0.113615 | -0.174782 | -0.245813 | 0.099881 | -0.238419 | -0.654242 | 0.01211 | 1.963416 | 0.085305 | 0.132144 | -0.164703 | 0.082742 | -0.174723 | 0.000804 | -0.180891 | -0.180891 | 0.037875 | -0.170452 | -0.234661 | -0.235263 | -0.002019 | 189374 | 499772 | 219301 | 358126.56 | 355082 | 0.632332 | 0.488313 | 0.209589 | 0.301544 | 0.488862 | 0.016077 | 4.009743 | 33.743783 | 37.753526 | 18.791349 | 72.049792 | 39.60906 | 33.638392 | 8745.981842 | 64.395923 | Sonoma County | Bay Area |
model_data['GEOID20'].is_unique
True
# Check distinct values for suspicious columns
model_data['target'].value_counts().sort_values(ascending=False) #UPTpercap, Ac_Water
target Southern California 8968 Bay Area 2517 Central California 1737 Northern California 249 Name: count, dtype: int64
# Separate columns
description_cols = model_data.columns[:10].tolist() + ['COUNTY_NAME']
input_cols = model_data.columns[10:-2].tolist()
target_col = ['target']
print(description_cols)
print(input_cols)
print(target_col)
['GEOID10', 'GEOID20', 'STATEFP', 'COUNTYFP', 'TRACTCE', 'BLKGRPCE', 'CSA', 'CSA_Name', 'CBSA', 'CBSA_Name', 'COUNTY_NAME'] ['CBSA_POP', 'CBSA_EMP', 'CBSA_WRK', 'Ac_Total', 'Ac_Water', 'Ac_Land', 'Ac_Unpr', 'TotPop', 'CountHU', 'HH', 'P_WrkAge', 'AutoOwn0', 'Pct_AO0', 'AutoOwn1', 'Pct_AO1', 'AutoOwn2p', 'Pct_AO2p', 'Workers', 'R_LowWageWk', 'R_MedWageWk', 'R_HiWageWk', 'R_PCTLOWWAGE', 'TotEmp', 'E5_Ret', 'E5_Off', 'E5_Ind', 'E5_Svc', 'E5_Ent', 'E8_Ret', 'E8_off', 'E8_Ind', 'E8_Svc', 'E8_Ent', 'E8_Ed', 'E8_Hlth', 'E8_Pub', 'E_LowWageWk', 'E_MedWageWk', 'E_HiWageWk', 'E_PctLowWage', 'D1A', 'D1B', 'D1C', 'D1C5_RET', 'D1C5_OFF', 'D1C5_IND', 'D1C5_SVC', 'D1C5_ENT', 'D1C8_RET', 'D1C8_OFF', 'D1C8_IND', 'D1C8_SVC', 'D1C8_ENT', 'D1C8_ED', 'D1C8_HLTH', 'D1C8_PUB', 'D1D', 'D1_FLAG', 'D2A_JPHH', 'D2B_E5MIX', 'D2B_E5MIXA', 'D2B_E8MIX', 'D2B_E8MIXA', 'D2A_EPHHM', 'D2C_TRPMX1', 'D2C_TRPMX2', 'D2C_TRIPEQ', 'D2R_JOBPOP', 'D2R_WRKEMP', 'D2A_WRKEMP', 'D2C_WREMLX', 'D3A', 'D3AAO', 'D3AMM', 'D3APO', 'D3B', 'D3BAO', 'D3BMM3', 'D3BMM4', 'D3BPO3', 'D3BPO4', 'D4A', 'D4B025', 'D4B050', 'D4C', 'D4D', 'D4E', 'D5AR', 'D5AE', 'D5BR', 'D5BE', 'D5CR', 'D5CRI', 'D5CE', 'D5CEI', 'D5DR', 'D5DRI', 'D5DE', 'D5DEI', 'D2A_Ranked', 'D2B_Ranked', 'D3B_Ranked', 'D4A_Ranked', 'NatWalkInd', 'Households', 'Workers_1', 'Residents', 'Drivers', 'White', 'Male', 'Lowwage', 'Medwage', 'Highwage', 'W_P_Lowwage', 'W_P_Medwage', 'W_P_Highwage', 'GasPrice', 'logd1a', 'logd1c', 'logd3aao', 'logd3apo', 'd4bo25', 'd5dei_1', 'logd4d', 'UPTpercap', 'B_C_constant', 'B_C_male', 'B_C_ld1c', 'B_C_drvmveh', 'B_C_ld1a', 'B_C_ld3apo', 'B_C_inc1', 'B_C_gasp', 'B_N_constant', 'B_N_inc2', 'B_N_inc3', 'B_N_white', 'B_N_male', 'B_N_drvmveh', 'B_N_gasp', 'B_N_ld1a', 'B_N_ld1c', 'B_N_ld3aao', 'B_N_ld3apo', 'B_N_d4bo25', 'B_N_d5dei', 'B_N_UPTpc', 'C_R_Households', 'C_R_Pop', 'C_R_Workers', 'C_R_Drivers', 'C_R_Vehicles', 'C_R_White', 'C_R_Male', 'C_R_Lowwage', 'C_R_Medwage', 'C_R_Highwage', 'C_R_DrmV', 'NonCom_VMT_Per_Worker', 'Com_VMT_Per_Worker', 'VMT_per_worker', 'VMT_tot_min', 'VMT_tot_max', 'VMT_tot_avg', 'GHG_per_worker', 'Annual_GHG', 'SLC_score'] ['target']
Check correlations manually to detect weirdness/problems¶
all_cols = []
delete_cols = []
random_elements = {}
threshold = 0.7
counter = 0
while counter < 2:
all_cols.extend(delete_cols)
all_cols.extend(random_elements)
X_copy = model_data[input_cols].drop(columns=all_cols)
corr_matrix = X_copy.corr()
# Find pairs with high correlation
high_corr_pairs = {}
for i in range(len(corr_matrix.columns)):
for j in range(i+1, len(corr_matrix.columns)):
if abs(corr_matrix.iloc[i, j]) > threshold:
high_corr_pairs[(corr_matrix.columns[i], corr_matrix.columns[j])] = corr_matrix.iloc[i, j]
# Separate columns and count how often they appear
high_corr_cols = list(high_corr_pairs.keys()) #need to unpack tuples and delete duplicates
high_corr_cols = [value for item in high_corr_cols for value in item] #<https://stackoverflow.com/questions/10632839/transform-list-of-tuples-into-a-flat-list-or-a-matrix>
counter_dict = Counter(high_corr_cols)
keys_delete = counter_dict.keys()
keys_delete = list(set(keys_delete))
high_corr_pairs_copy = {k: v for k, v in high_corr_pairs.items() if all(i in keys_delete for i in k)}
delete_cols = [tool for tool, value in counter_dict.items() if value > 1]
if high_corr_pairs_copy:
for key in high_corr_pairs_copy:
random_elements = [random.choice(key) for key in high_corr_pairs_copy]
print(counter_dict)
print(delete_cols)
print(high_corr_pairs_copy)
print(random_elements)
#Update Counter
if not delete_cols or random_elements:
counter += 1
X_copy.columns.tolist()
# Check again
corr_matrix = X_copy.corr()
threshold = 0.7
# Find pairs with high correlation
high_corr_pairs = {}
for i in range(len(corr_matrix.columns)):
for j in range(i+1, len(corr_matrix.columns)):
if abs(corr_matrix.iloc[i, j]) > threshold:
high_corr_pairs[(corr_matrix.columns[i], corr_matrix.columns[j])] = corr_matrix.iloc[i, j]
# Print something if high correlation pairs are/aren't found
if high_corr_pairs:
print(high_corr_pairs)
# Print out high correlation pairs
print("Pairs with high correlation (|correlation| > {:.2f}):".format(threshold))
for pair, corr_value in high_corr_pairs.items():
print(f"{pair[0]} and {pair[1]}: {corr_value:.2f}")
else:
print("There are no pairs with a correlation above the threshold: (|correlation| > {:.2f}):".format(threshold))
There are no pairs with a correlation above the threshold: (|correlation| > 0.70):
# Grab low corr columns and try to get rid of county-level variables
low_corr_cols = X_copy.columns.tolist()
county_lvl_cols = [s for s in low_corr_cols if s.startswith("C_R") or s.startswith("B_N") or s.startswith("B_C") or s.startswith("VMT_tot")]
county_lvl_cols.extend(('R_PCTLOWWAGE','GasPrice', 'UPTpercap', 'Ac_Water', 'D1_FLAG', 'd4bo25', 'd5dei_1', 'D4A_Ranked','D4B050' )) # UPTpercap, Ac_water, D1_FLAG
low_corr_cols = [x for x in low_corr_cols if x not in county_lvl_cols]
# Encode and reshape data
X = model_data[low_corr_cols]
y = model_data[target_col]
num_target = LabelEncoder()
y = num_target.fit_transform(y).flatten()
#Check results
print(county_lvl_cols)
print(X.shape)
print(y.shape)
['B_C_male', 'B_C_ld1c', 'B_C_ld1a', 'B_C_ld3apo', 'B_C_inc1', 'B_N_drvmveh', 'B_N_ld3apo', 'C_R_White', 'C_R_Male', 'C_R_DrmV', 'VMT_tot_max', 'VMT_tot_avg', 'R_PCTLOWWAGE', 'GasPrice', 'UPTpercap', 'Ac_Water', 'D1_FLAG', 'd4bo25', 'd5dei_1', 'D4A_Ranked', 'D4B050'] (13471, 33) (13471,)
c:\Users\rdn91\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\preprocessing\_label.py:116: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True)
EDA¶
# Check for nulls in any column for each row
X[(X.isna()).all(axis=1)]
| Ac_Land | Ac_Unpr | P_WrkAge | AutoOwn0 | Pct_AO1 | E5_Ret | E8_Ent | E8_Ed | E8_Hlth | D1C5_ENT | D1C8_RET | D1C8_IND | D1C8_HLTH | D2A_JPHH | D2C_TRIPEQ | D2A_WRKEMP | D3AMM | D3BMM3 | D3BMM4 | D3BPO4 | D4A | D4C | D4D | D4E | D5BE | D5CEI | NatWalkInd | White | W_P_Medwage | logd1c | logd4d | NonCom_VMT_Per_Worker |
|---|
# Check for any negative values that don't make sense (column-wise) for each row
X[(X < 0).all(axis=1)]
| Ac_Land | Ac_Unpr | P_WrkAge | AutoOwn0 | Pct_AO1 | E5_Ret | E8_Ent | E8_Ed | E8_Hlth | D1C5_ENT | D1C8_RET | D1C8_IND | D1C8_HLTH | D2A_JPHH | D2C_TRIPEQ | D2A_WRKEMP | D3AMM | D3BMM3 | D3BMM4 | D3BPO4 | D4A | D4C | D4D | D4E | D5BE | D5CEI | NatWalkInd | White | W_P_Medwage | logd1c | logd4d | NonCom_VMT_Per_Worker |
|---|
X.columns
Index(['Ac_Total', 'Ac_Unpr', 'P_WrkAge', 'AutoOwn0', 'Pct_AO2p', 'E5_Ret',
'E8_Ent', 'E8_Hlth', 'D1C8_RET', 'D1C8_IND', 'D1C8_ENT', 'D1C8_ED',
'D1C8_HLTH', 'D1C8_PUB', 'D2A_JPHH', 'D2C_TRIPEQ', 'D2A_WRKEMP',
'D3AMM', 'D3BMM3', 'D3BMM4', 'D3BPO3', 'D3BPO4', 'D4A', 'D4C', 'D4D',
'D4E', 'D5BE', 'D5CEI', 'NatWalkInd', 'White', 'W_P_Medwage', 'logd1c',
'logd4d', 'NonCom_VMT_Per_Worker'],
dtype='object')
plot_cols(X, X.columns, 4, "histplot")
plot_cols(X, X.columns, 4, "kdeplot")
X.describe()
| Ac_Land | Ac_Unpr | P_WrkAge | AutoOwn0 | Pct_AO2p | E5_Ret | E5_Ent | E8_Hlth | D1C5_RET | D1C5_IND | D1C5_ENT | D1C8_ED | D1C8_HLTH | D2A_JPHH | D2C_TRIPEQ | D2A_WRKEMP | D3AMM | D3BMM3 | D3BMM4 | D3BPO4 | D4A | D4C | D4D | D4E | D5BR | D5CRI | NatWalkInd | White | W_P_Medwage | logd1c | logd4d | NonCom_VMT_Per_Worker | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 13471.000000 | 13471.000000 | 13471.000000 | 13471.000000 | 13471.000000 | 13471.000000 | 13471.000000 | 13471.000000 | 13471.000000 | 13471.000000 | 13471.000000 | 13471.000000 | 13471.000000 | 13471.000000 | 1.347100e+04 | 13471.000000 | 13471.000000 | 13471.000000 | 13471.000000 | 13471.000000 | 13471.000000 | 13471.000000 | 13471.000000 | 13471.000000 | 1.347100e+04 | 13471.000000 | 13471.000000 | 13471.000000 | 13471.000000 | 13471.000000 | 13471.000000 | 13471.000000 |
| mean | 339.664921 | 250.658867 | 0.620634 | 48.916784 | 0.593864 | 82.285502 | 100.380744 | 124.242150 | 0.632095 | 0.737723 | 0.929933 | 0.523923 | 1.232386 | 2.208971 | 4.729318e-01 | 6.017315 | 3.878526 | 23.160982 | 14.113828 | 35.506695 | 460.697881 | 8.727682 | 97.765096 | 0.007959 | 1.789674e+05 | 0.495927 | 14.187860 | 522.830970 | 0.366496 | 1.336787 | 2.945438 | 2.692507 |
| std | 4116.565820 | 1310.140462 | 0.092262 | 82.042930 | 0.197057 | 252.685108 | 399.282251 | 459.071392 | 1.961353 | 2.917049 | 3.652411 | 4.896970 | 5.135792 | 27.981841 | 2.803600e-01 | 11.105327 | 3.499955 | 27.653597 | 20.140445 | 45.755758 | 256.464434 | 14.407550 | 299.399298 | 0.065520 | 2.112026e+05 | 0.217665 | 2.251361 | 479.981067 | 0.109384 | 0.916786 | 1.777173 | 1.158195 |
| min | 3.806823 | 3.806823 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.312035e-198 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.330000 | 0.000518 | 0.000000 | 5.000000e+00 | 0.000000 | 5.666667 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.420682 |
| 25% | 64.009278 | 62.491096 | 0.567000 | 8.000000 | 0.464562 | 0.000000 | 0.000000 | 17.000000 | 0.000000 | 0.029345 | 0.000000 | 0.000000 | 0.139561 | 0.193846 | 2.859821e-01 | 1.086739 | 1.450393 | 2.039375 | 0.000000 | 7.864765 | 274.930000 | 2.000000 | 8.747939 | 0.001367 | 2.466050e+04 | 0.352438 | 12.666667 | 150.000000 | 0.296687 | 0.632475 | 2.000000 | 1.878572 |
| 50% | 106.410403 | 102.778227 | 0.616000 | 25.000000 | 0.612903 | 13.000000 | 18.000000 | 37.000000 | 0.111695 | 0.123894 | 0.149405 | 0.003743 | 0.362378 | 0.452349 | 4.870922e-01 | 2.900000 | 3.227179 | 15.239382 | 7.966047 | 20.327148 | 426.480000 | 4.670000 | 25.189643 | 0.003074 | 9.063600e+04 | 0.522043 | 14.166667 | 414.000000 | 0.370327 | 1.159634 | 3.000000 | 2.443149 |
| 75% | 183.090845 | 174.740024 | 0.670000 | 60.000000 | 0.744975 | 63.000000 | 84.000000 | 89.000000 | 0.538239 | 0.455436 | 0.708771 | 0.365651 | 0.902096 | 1.157211 | 6.777726e-01 | 6.929154 | 5.413592 | 33.179379 | 18.999462 | 45.465900 | 618.260000 | 10.000000 | 71.899564 | 0.007014 | 2.753380e+05 | 0.648365 | 15.833333 | 762.000000 | 0.437679 | 1.859992 | 4.000000 | 3.319817 |
| max | 407868.018652 | 61946.753520 | 1.000000 | 1710.000000 | 1.000000 | 5780.000000 | 27592.000000 | 12572.000000 | 84.893696 | 121.727268 | 139.624405 | 376.807057 | 326.152342 | 1862.888889 | 9.995461e-01 | 414.000000 | 41.753275 | 357.134925 | 432.828295 | 671.062217 | 1207.000000 | 402.330000 | 17191.471949 | 5.734000 | 1.193711e+06 | 1.000000 | 20.000000 | 6371.000000 | 1.000000 | 6.644192 | 10.000000 | 10.911256 |
Feature Selection¶
print(X.shape, y.shape)
(13471, 32) (13471,)
# Split data into training, validation, and test --> 70-15-15 split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size = 0.3, stratify=y)
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp)
print("Original data shape:", X.shape, "\nOutput:", y.shape, "\n")
print("Shape of training dataset: ", "\n", "Input Features:", X_train.shape, "\n", "Output:", y_train.shape)
print("\nShape of validation dataset: ", "\n", "Input Features:", X_validation.shape, "\n", "Output:", y_validation.shape)
print("\nShape of testing dataset: ", "\n", "Input Features:", X_test.shape, "\n", "Output:", y_test.shape)
Original data shape: (13471, 32) Output: (13471,) Shape of training dataset: Input Features: (9429, 32) Output: (9429,) Shape of validation dataset: Input Features: (2021, 32) Output: (2021,) Shape of testing dataset: Input Features: (2021, 32) Output: (2021,)
# Scale the data
scaler = StandardScaler().set_output(transform="pandas")
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns = X.columns)
# Permutation Importants using Random Forest
rf_feature = RandomForestClassifier(n_estimators=100)
rf_feature.fit(X_scaled, y)
importances = rf_feature.feature_importances_
#std = np.std([RF.feature_importances_ for tree in RF.estimators_], axis=0)
idf = pd.DataFrame({'feature': X_scaled.columns, 'importance': importances})
#idf = pd.DataFrame({'feature': X_train.columns, 'importance': importances, 'std': std})
idf = idf.sort_values('importance',ascending=False)
print(idf)
ax = idf.plot(kind='barh', x='feature', legend=False)
ax.set_ylabel('')
feature importance 31 NonCom_VMT_Per_Worker 0.248721 25 D5CRI 0.058175 24 D5BR 0.052645 4 Pct_AO2p 0.038953 27 White 0.038428 22 D4D 0.038009 1 Ac_Unpr 0.035215 0 Ac_Land 0.033485 19 D3BPO4 0.030505 28 W_P_Medwage 0.027864 29 logd1c 0.027232 26 NatWalkInd 0.024348 12 D1C8_HLTH 0.024054 21 D4C 0.022016 23 D4E 0.021928 2 P_WrkAge 0.021344 16 D3AMM 0.020406 20 D4A 0.020170 3 AutoOwn0 0.020124 15 D2A_WRKEMP 0.019464 9 D1C5_IND 0.019160 7 E8_Hlth 0.017141 17 D3BMM3 0.016151 13 D2A_JPHH 0.015960 10 D1C5_ENT 0.015623 14 D2C_TRIPEQ 0.015366 18 D3BMM4 0.015231 8 D1C5_RET 0.013872 30 logd4d 0.013360 6 E5_Ent 0.012291 5 E5_Ret 0.011754 11 D1C8_ED 0.011006
Text(0, 0.5, '')
# Testing Random Forest Feature Importance
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_scaled, y)
importances_rf = rf_model.feature_importances_
# Select the top 5 features based on feature importance
top_5_rf = np.argsort(importances_rf)[-5:] # Get indices of top 5 features
selected_features_rf = X_scaled.columns[top_5_rf]
X_scaled = X_scaled[selected_features_rf]
X_scaled
| White | Pct_AO2p | D5BR | D5CRI | NonCom_VMT_Per_Worker | |
|---|---|---|---|---|---|
| 0 | -0.414263 | 1.673694 | -0.831197 | -0.115725 | 1.635124 |
| 1 | 2.771398 | 0.846390 | -0.846150 | 0.113568 | 3.120810 |
| 2 | 1.583807 | -0.006452 | -0.845662 | 0.063699 | 3.777755 |
| 3 | -0.428847 | 1.085152 | -0.846528 | -0.034688 | 0.849058 |
| 4 | 1.446296 | 0.818724 | -0.846637 | -0.043519 | 1.137360 |
| ... | ... | ... | ... | ... | ... |
| 13466 | -0.124657 | 0.200302 | -0.701804 | -1.922919 | -0.145962 |
| 13467 | 0.119111 | 0.624003 | -0.554665 | -0.415139 | -0.612506 |
| 13468 | 0.083692 | 1.415196 | -0.511691 | -0.861541 | -0.386401 |
| 13469 | 0.171199 | 1.350357 | -0.569950 | -1.045051 | -0.193661 |
| 13470 | 0.092026 | 0.511127 | -0.521359 | -0.773360 | -0.424307 |
13471 rows × 5 columns
Supervised Learning¶
# Do K-Fold Cross Validation
pipeline = Pipeline ([
('smote', BorderlineSMOTE(sampling_strategy='not majority', m_neighbors=100, k_neighbors=100)),
('scale', StandardScaler()),
('rf', RandomForestClassifier()) #no combination of hyperparameters led to good increase in f1 score, just focus on feature engineering and not hyperparameter tuning
])
param_grid = {
}
cv = StratifiedKFold(n_splits=5)
grid = GridSearchCV(pipeline, param_grid=param_grid, cv=cv, n_jobs=-1,scoring='f1_weighted', refit=True)
grid_data = grid.fit(X_train, y_train)
results = pd.DataFrame(grid_data.cv_results_)
results.sort_values(by=['rank_test_score'])
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | params | split0_test_score | split1_test_score | split2_test_score | split3_test_score | split4_test_score | mean_test_score | std_test_score | rank_test_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11.355697 | 0.150327 | 0.028233 | 0.004922 | {} | 0.858138 | 0.861453 | 0.844702 | 0.859861 | 0.860895 | 0.85701 | 0.006256 | 1 |
# Do K-Fold Cross Validation
pipeline = Pipeline ([
('smote', BorderlineSMOTE(sampling_strategy='not majority', m_neighbors=100, k_neighbors=100)),
('scale', StandardScaler()),
('xgb', XGBClassifier(objective='multi:softprob', colsample_bytree=0.9, eta=0.6)) #no combination of hyperparameters led to good increase in f1 score, just focus on feature engineering and not hyperparameter tuning
])
param_grid = {
}
cv = StratifiedKFold(n_splits=5)
grid = GridSearchCV(pipeline, param_grid=param_grid, cv=cv, n_jobs=-1,scoring='f1_weighted', refit=True)
grid_data = grid.fit(X_train, y_train)
results = pd.DataFrame(grid_data.cv_results_)
results.sort_values(by=['rank_test_score'])
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | params | split0_test_score | split1_test_score | split2_test_score | split3_test_score | split4_test_score | mean_test_score | std_test_score | rank_test_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3.107797 | 0.032659 | 0.01853 | 0.001632 | {} | 0.89548 | 0.911367 | 0.898752 | 0.912347 | 0.908622 | 0.905314 | 0.006882 | 1 |
#<https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html>
# Plot non-normalized confusion matrix
titles_options = [
("Confusion matrix, without normalization", None),
("Normalized confusion matrix", "true"), # true --> sum of true values (rowwise total sum)
]
for title, normalize in titles_options:
disp = ConfusionMatrixDisplay.from_estimator(
grid_data,
X_validation,
y_validation,
display_labels=num_target.inverse_transform([0,1,2,3]),
cmap=plt.cm.Blues,
normalize=normalize,
)
disp.ax_.set_title(title)
fig = disp.figure_
fig.set_figwidth(10)
fig.set_figheight(10)
print(title)
print(disp.confusion_matrix)
plt.show()
# F1-score
y_pred = grid_data.predict(X_validation)
print(classification_report(y_validation, y_pred, digits = 4))
Confusion matrix, without normalization [[ 328 32 1 16] [ 28 212 4 17] [ 0 3 32 2] [ 19 31 6 1290]] Normalized confusion matrix [[0.87002653 0.08488064 0.00265252 0.04244032] [0.10727969 0.81226054 0.01532567 0.0651341 ] [0. 0.08108108 0.86486486 0.05405405] [0.0141159 0.0230312 0.00445765 0.95839525]]
precision recall f1-score support
0 0.8747 0.8700 0.8723 377
1 0.7626 0.8123 0.7866 261
2 0.7442 0.8649 0.8000 37
3 0.9736 0.9584 0.9659 1346
accuracy 0.9213 2021
macro avg 0.8388 0.8764 0.8562 2021
weighted avg 0.9237 0.9213 0.9223 2021
# Test data
titles_options = [
("Confusion matrix, without normalization", None),
("Normalized confusion matrix", "true"), # true --> sum of true values (rowwise total sum)
]
for title, normalize in titles_options:
disp = ConfusionMatrixDisplay.from_estimator(
grid_data,
X_test,
y_test,
display_labels=num_target.inverse_transform([0,1,2,3]),
cmap=plt.cm.Blues,
normalize=normalize,
)
disp.ax_.set_title(title)
fig = disp.figure_
fig.set_figwidth(10)
fig.set_figheight(10)
print(title)
print(disp.confusion_matrix)
plt.show()
# F1-score
y_pred = grid_data.predict(X_test)
print(classification_report(y_test, y_pred, digits = 4))
# What does each label mean
print(y.dtype)
print(num_target.inverse_transform([0,1,2,3]))
print(Counter(y))
int32
['Bay Area' 'Central California' 'Northern California'
'Southern California']
Counter({3: 8968, 0: 2517, 1: 1737, 2: 249})
Clustering¶
Dimensionality Reduction/Feature Extraction Methods¶
# For graphing 2d
def plot_scatter_with_adjusted_labels(data, method_name,y):
plt.figure(figsize=(10, 7))
label_dict = {0: 'Bay Area',
1: 'Central California',
2: 'Northern California',
3: 'Southern California'
}
scatter = plt.scatter(data[:, -1], data[:, :-1], c=y, cmap='viridis')
plt.title(f'{method_name} Scatter Plot with First Two Components')
plt.xlabel(f'First {method_name} Dimension')
plt.ylabel(f'Second {method_name} Dimension')
# Adding all labels, with adjustText for avoiding overlap
#texts = [plt.text(data[i, 0], data[i, 1], str(y.iloc[i]), fontsize=9) for i in range(len(y))]
# Adjust the text labels to avoid overlaps
#adjust_text(texts, arrowprops=dict(arrowstyle='-', color='gray', lw=0.5))
plt.colorbar(scatter)
unique_labels = np.unique(y)
handles = [plt.Line2D([0], [0], marker='o', color='w',
markerfacecolor=scatter.cmap(scatter.norm(label)), markersize=10)
for label in unique_labels]
plt.legend(handles, [label_dict[label] for label in unique_labels], title="Regions", loc="best")
plt.show()
# For graphing 3d
def plot_scatter_with_adjusted_labels(data, method_name, y):
fig = go.Figure(data=[go.Scatter3d(
x=data[:, 0],
y=data[:, 1],
z=data[:, 2],
mode='markers',
marker=dict(color=y,
colorscale='Viridis',
showscale=True,
colorbar=dict(
title='Region',
ticktext=['Bay Area', 'Central California', 'Northern California','Southern California'],
tickvals=[0,1,2,3],
ticks='outside'
)
),
text=y
)])
fig.update_layout(
title=f'{method_name} Scatter Plot with First Three Components',
scene=dict(
xaxis_title=f'First {method_name} Dimension',
yaxis_title=f'Second {method_name} Dimension',
zaxis_title=f'Third {method_name} Dimension'
),
width=1200,
height=800
)
fig.update_layout(
dragmode='drawrect',
)
fig.show(config=dict(displayModeBar=True))
Undersampling/Oversampling¶
over_sample = BorderlineSMOTE(sampling_strategy='not majority', m_neighbors=200, k_neighbors=200)
X_scaled_copy, y_copy = over_sample.fit_resample(X_scaled, y)
print(Counter(y_copy))
Counter({0: 8968, 3: 8968, 1: 8968, 2: 8968})
PCA¶
pca_result = PCA(n_components=2).fit_transform(X_scaled_copy)
plot_scatter_with_adjusted_labels(pca_result, 'PCA',y_copy)
MDS¶
mds_result = MDS(n_components=2, random_state=10, n_jobs=-1).fit_transform(X)
plot_scatter_with_adjusted_labels(mds_result, 'MDS',y)
Takes too long
IsoMap¶
isomap_result = Isomap(n_neighbors=500,n_components=2).fit_transform(X)
plot_scatter_with_adjusted_labels(isomap_result, 'Isomap',y)
Takes too long
LLE¶
Will take too long
t-SNE (Oversampling)¶
# High corr threshold with 'not-majority' sampling
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10).fit_transform(X_scaled_copy)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y_copy)
# Medium corr threshold with 'not-majority' sampling
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10).fit_transform(X_scaled_copy)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y_copy)
# Low corr threshold with 'not-majority' sampling
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10).fit_transform(X_scaled_copy)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y_copy)
BorderlineSmote t-SNE (Med. Corr Threshold)¶
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10, n_jobs=-1).fit_transform(X_scaled_copy)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y_copy)
BorderlineSmote t-SNE (Med. Corr Threshold and Top 5 Features after Random Forest)¶
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10, n_jobs=-1).fit_transform(X_scaled_copy)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y_copy)
t-SNE (Undersampling)¶
# Low corr threshold with 'not-minority' sampling
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10).fit_transform(X_scaled_copy)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y_copy)
# Medium corr threshold with 'not-minority' sampling
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10).fit_transform(X_scaled_copy)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y_copy)
# High corr threshold with 'not-minority' sampling
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10).fit_transform(X_scaled_copy)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y_copy)
# Testing
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10).fit_transform(X_scaled_copy)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y_copy)
t-SNE on original data (few columns vs a lot)¶
# Low correlation threshold (0.5)
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10).fit_transform(X_scaled)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y)
# Med correlation threshold (0.7)
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10).fit_transform(X_scaled)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y)
# High correlation threshold (0.9)
tsne_result = TSNE(n_components=2, perplexity=5, random_state=10).fit_transform(X_scaled)
plot_scatter_with_adjusted_labels(tsne_result, 't-SNE',y)
Clustering (Agglomerative and Hierarchical)¶
dbscan = DBSCAN(eps=0.00001,metric='euclidean', min_samples=1)
labels = dbscan.fit_predict(tsne_result)
fig, ax = plt.subplots(5, 2, figsize=(30, 40))
# Dimensionality reduction plot
og_plot = ax[0,0].scatter(tsne_result[:, -1], tsne_result[:, :-1], c=y_copy, cmap='viridis')
ax[0,0].set_title('Data after tsne and Borderline Smote using original labels')
plt.colorbar(og_plot, ax=ax[0,0])
label_dict = {0: 'Bay Area',
1: 'Central California',
2: 'Northern California',
3: 'Southern California'
}
unique_labels = np.unique(y)
handles = [plt.Line2D([0], [0], marker='o', color='w',
markerfacecolor=og_plot.cmap(og_plot.norm(label)), markersize=10) for label in unique_labels]
ax[0,0].legend(handles, [label_dict[label] for label in unique_labels], title="Regions", loc="best")
# DBscan plot
cluster_plot = ax[0,1].scatter(tsne_result[:, -1], tsne_result[:, :-1], c=labels, cmap='viridis')
ax[0,1].set_title('Clustering with DBscan')
plt.colorbar(cluster_plot, ax=ax[0,1])
#Filtering greenish points
mask_green = (y_copy == 2)
green_plot = ax[1,0].scatter(tsne_result[mask_green, -1], tsne_result[mask_green, :-1], c='#22a884')
ax[1,0].set_title('Filtering the green values (Northern California)')
#Filtering yellowish points
mask_yellow = (y_copy == 3)
yellow_plot = ax[2,0].scatter(tsne_result[mask_yellow, -1], tsne_result[mask_yellow, :-1], c='#fde725')
ax[2,0].set_title('Filtering the yellow values (Southern California)')
#Filtering purpleish points
mask_purple = (y_copy == 0)
purple_plot = ax[3,0].scatter(tsne_result[mask_purple, -1], tsne_result[mask_purple, :-1], c='#440154')
ax[3,0].set_title('Filtering the purple values (Bay Area)')
#Filtering blueish points
mask_blue = (y_copy == 1)
blue_plot = ax[4,0].scatter(tsne_result[mask_blue, -1], tsne_result[mask_blue, :-1], c='#2a788e')
ax[4,0].set_title('Filtering the blue values (Central California)')
#Compare Central California and Bay Area
purple_plot = ax[4,1].scatter(tsne_result[mask_purple, -1], tsne_result[mask_purple, :-1], c='#440154')
blue_plot = ax[4,1].scatter(tsne_result[mask_blue, -1], tsne_result[mask_blue, :-1], c='#2a788e')
ax[4,1].set_title('Comparing Central California and Bay Area')
#Compare Northern California and Central California
green_plot = ax[1,1].scatter(tsne_result[mask_green, -1], tsne_result[mask_green, :-1], c='#22a884')
blue_plot = ax[1,1].scatter(tsne_result[mask_blue, -1], tsne_result[mask_blue, :-1], c='#2a788e')
ax[1,1].set_title('Comparing Central and Northern California')
#Compare Southern California and Bay Area
purple_plot = ax[2,1].scatter(tsne_result[mask_purple, -1], tsne_result[mask_purple, :-1], c='#440154')
yellow_plot = ax[2,1].scatter(tsne_result[mask_yellow, -1], tsne_result[mask_yellow, :-1], c='#fde725')
ax[2,1].set_title('Comparing Southern California and the Bay Area')
# Compare Bay Area and Northern California
purple_plot = ax[3,1].scatter(tsne_result[mask_purple, -1], tsne_result[mask_purple, :-1], c='#440154')
green_plot = ax[3,1].scatter(tsne_result[mask_green, -1], tsne_result[mask_green, :-1], c='#22a884')
ax[3,1].set_title('Comparing Bay Area and Northern California')
plt.show()
#'dice', 'precomputed', 'cityblock', 'hamming', 'yule', 'rogerstanimoto', 'correlation', 'mahalanobis', 'manhattan', 'l2', 'russellrao', 'sokalmichener',
# 'nan_euclidean', 'canberra', 'matching', 'cosine', 'chebyshev', 'sokalsneath', 'wminkowski', 'haversine', 'minkowski', 'kulsinski', 'sqeuclidean',
# 'braycurtis', 'jaccard', 'seuclidean', 'l1', 'euclidean
fig, ax = plt.subplots(2, 2, figsize=(30, 20))
# DBscan plot
cluster_plot = ax[0,0].scatter(tsne_result[:, -1], tsne_result[:, :-1], c=labels, cmap='viridis')
ax[0,0].set_title('Clustering with DBscan')
plt.colorbar(cluster_plot, ax=ax[0,0])
# Form 2 clusters
mask_blue_greens = (labels >= 0) & (labels <=30000)
blue_greens_plot = ax[0,1].scatter(tsne_result[mask_blue_greens, -1], tsne_result[mask_blue_greens, :-1], c='#008B8B')
ax[0,1].set_title('Combining Bay Area, Central California, and Southern California')
# Plot other cluster
mask_yellow2 = (labels > 30000)
yellow2_plot = ax[1,1].scatter(tsne_result[mask_yellow2, -1], tsne_result[mask_yellow2, :-1], c='#fde725')
ax[1,1].set_title('Northern California')
plt.show()
kmeans = KMeans(n_clusters=2)
kmeans.fit(tsne_result)
labels = kmeans.labels_
plt.scatter(tsne_result[:, -1], tsne_result[:, :-1], c=labels)
plt.title("K-Means Clustering")
plt.show()
c:\Users\rdn91\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn(
model = AgglomerativeClustering(n_clusters=2, metric='euclidean', linkage='ward')
model.fit(tsne_result)
labels = model.labels_
plt.scatter(tsne_result[:, -1], tsne_result[:, :-1], c=labels, cmap='viridis')
plt.title('Agglomerative Clustering')
plt.show()
Findings¶
Based on the scatter plot of the data after dimensionality reduction but before "clustering", I have some sort of intuition that there are kinda 2 clusters. For one cluster, we have Central California, Bay Area, and Southern California. The other cluster being primarily composed of Northern California. This result isn't really that surpising since we don't expect the census block groups (parts of a city) to be quite different from each other for different regions of California. Even though the cluster that contains the 3 California regions seems quite densely packed, there does seem to be some obvious patterns.
However, there is a major thing that needs to be remembered. There is an element of randomness for the feature selection and dimensionality reduction method. What this means is that each time we run the code, the results will not exactly be the same. However, I found that there were few cases where the general/global struture/shape of the results were totally different. My feature selection process is sort of like a pseudo-forward selection process using a correlation threshold.
Finding Patterns between Central California, Bay Area, and Southern California (Unsupervised + Supervised Learning)¶
After applying BorderlineSmote, I thought Central and Northern California wouldn't be that close knit as they are.
Central California seems to have a more diverse structure to it compared to Northern California.
Other than that, this results is not that surpising.
The data for Southern California are more dense and are "contained" inside the Bay Area structure.
However, I don't think I can say more in terms of quantitative patterns because it is not appropriate to give such meaning to data that has been through standardization, dimensionality reduction, and oversampling.
This plot goes with the general intuition that Southern California and Bay Area aren't that different in terms of what a typical "city" looks like besides the Bay Area being close to water.
This results aligns exactly as one might think. There is a reason why Bay Area is given its own region and not combined with another region.
Funny how I just said how the Bay Area should be given its own region but the plot does show that the census block groups for the Bay Area aren't quite as spread out. It's important to note that we only selected a certain subset of the available variables so the interpretation might change if I were to rerun all the code again.
Conclusion¶
I don't think that applying feature selection, dimensionality reduction, oversampling, and clustering will yield "significant" results when it comes to finding patterns involving census block groups from different California regions. However, just because I did not find a significant results, doesn't mean I should keep discard what the plots show. The clustering/plots affirm what we think about California regions and how different areas compare to one another. I think using various descriptive statistical techniques will bring about more impactful/insightful findings for this particular domain.
Finding Patterns using Descriptive Statistics¶
model_data.head()
| GEOID10 | GEOID20 | STATEFP | COUNTYFP | TRACTCE | BLKGRPCE | CSA | CSA_Name | CBSA | CBSA_Name | CBSA_POP | CBSA_EMP | CBSA_WRK | Ac_Total | Ac_Water | Ac_Land | Ac_Unpr | TotPop | CountHU | HH | P_WrkAge | AutoOwn0 | Pct_AO0 | AutoOwn1 | Pct_AO1 | AutoOwn2p | Pct_AO2p | Workers | R_LowWageWk | R_MedWageWk | R_HiWageWk | R_PCTLOWWAGE | TotEmp | E5_Ret | E5_Off | E5_Ind | E5_Svc | E5_Ent | E8_Ret | E8_off | E8_Ind | E8_Svc | E8_Ent | E8_Ed | E8_Hlth | E8_Pub | E_LowWageWk | E_MedWageWk | E_HiWageWk | E_PctLowWage | D1A | D1B | D1C | D1C5_RET | D1C5_OFF | D1C5_IND | D1C5_SVC | D1C5_ENT | D1C8_RET | D1C8_OFF | D1C8_IND | D1C8_SVC | D1C8_ENT | D1C8_ED | D1C8_HLTH | D1C8_PUB | D1D | D1_FLAG | D2A_JPHH | D2B_E5MIX | D2B_E5MIXA | D2B_E8MIX | D2B_E8MIXA | D2A_EPHHM | D2C_TRPMX1 | D2C_TRPMX2 | D2C_TRIPEQ | D2R_JOBPOP | D2R_WRKEMP | D2A_WRKEMP | D2C_WREMLX | D3A | D3AAO | D3AMM | D3APO | D3B | D3BAO | D3BMM3 | D3BMM4 | D3BPO3 | D3BPO4 | D4A | D4B025 | D4B050 | D4C | D4D | D4E | D5AR | D5AE | D5BR | D5BE | D5CR | D5CRI | D5CE | D5CEI | D5DR | D5DRI | D5DE | D5DEI | D2A_Ranked | D2B_Ranked | D3B_Ranked | D4A_Ranked | NatWalkInd | Households | Workers_1 | Residents | Drivers | White | Male | Lowwage | Medwage | Highwage | W_P_Lowwage | W_P_Medwage | W_P_Highwage | GasPrice | logd1a | logd1c | logd3aao | logd3apo | d4bo25 | d5dei_1 | logd4d | UPTpercap | B_C_constant | B_C_male | B_C_ld1c | B_C_drvmveh | B_C_ld1a | B_C_ld3apo | B_C_inc1 | B_C_gasp | B_N_constant | B_N_inc2 | B_N_inc3 | B_N_white | B_N_male | B_N_drvmveh | B_N_gasp | B_N_ld1a | B_N_ld1c | B_N_ld3aao | B_N_ld3apo | B_N_d4bo25 | B_N_d5dei | B_N_UPTpc | C_R_Households | C_R_Pop | C_R_Workers | C_R_Drivers | C_R_Vehicles | C_R_White | C_R_Male | C_R_Lowwage | C_R_Medwage | C_R_Highwage | C_R_DrmV | NonCom_VMT_Per_Worker | Com_VMT_Per_Worker | VMT_per_worker | VMT_tot_min | VMT_tot_max | VMT_tot_avg | GHG_per_worker | Annual_GHG | SLC_score | COUNTY_NAME | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 60971513084 | 60971513084 | 6 | 97 | 151308 | 4 | 488.0 | San Jose-San Francisco-Oakland, CA | 42220.0 | Santa Rosa-Petaluma, CA | 501317.0 | 201244.0 | 219300.0 | 67.135587 | 0.0 | 67.135587 | 60.291730 | 396 | 169 | 131 | 0.490 | 0 | 0.000000 | 10 | 0.076336 | 121 | 0.923664 | 276 | 46 | 78 | 152 | 0.166667 | 30 | 4 | 0 | 16 | 10 | 0 | 4 | 0 | 16 | 9 | 0 | 0 | 1 | 0 | 6 | 9 | 15 | 0.200000 | 2.803038 | 6.568065 | 0.497581 | 0.066344 | 0.000000 | 0.265376 | 0.165860 | 0.000000 | 0.066344 | 0.000000 | 0.265376 | 0.149274 | 0.000000 | 0.000000 | 0.016586 | 0.0 | 3.300619 | 0.0 | 0.229008 | 0.883037 | 0.602767 | 0.777957 | 0.518638 | 0.477267 | 0.397559 | 0.395591 | 9.525833e-02 | 0.140845 | 0.196078 | 9.200000 | 2.746536e-04 | 22.454792 | 0.000000 | 0.064477 | 22.390315 | 149.409879 | 0.000000 | 0.000000 | 0.000000 | 181.125994 | 28.598841 | 615.57 | 0.00000 | 0.000000 | 0.67 | 6.387075 | 0.001692 | 20817.0 | 23238.0 | 3423.0 | 4467.0 | 0.002432 | 0.470739 | 0.002547 | 0.504417 | 0.001056 | 0.058551 | 0.001686 | 0.098464 | 9.0 | 9.0 | 17.0 | 15.0 | 13.666667 | 160 | 276 | 464 | 273.68 | 324 | 244 | 46 | 78 | 152 | 0.200000 | 0.300000 | 0.500000 | 342 | 1.335800 | 0.403851 | 0.00000 | 3.152322 | 0 | 0 | 0 | 8 | 0.119722 | 0.113615 | -0.174782 | -0.245813 | 0.099881 | -0.238419 | -0.654242 | 0.01211 | 1.963416 | 0.085305 | 0.132144 | -0.164703 | 0.082742 | -0.174723 | 0.000804 | -0.180891 | -0.180891 | 0.037875 | -0.170452 | -0.234661 | -0.235263 | -0.002019 | 189374 | 499772 | 219301 | 358126.56 | 355082 | 0.632332 | 0.488313 | 0.209589 | 0.301544 | 0.488862 | 0.016077 | 4.586229 | 36.685037 | 41.271266 | 18.791349 | 72.049792 | 39.60906 | 36.772698 | 9560.901540 | 57.790885 | Sonoma County | Bay Area |
| 1 | 60971513092 | 60971513092 | 6 | 97 | 151309 | 2 | 488.0 | San Jose-San Francisco-Oakland, CA | 42220.0 | Santa Rosa-Petaluma, CA | 501317.0 | 201244.0 | 219300.0 | 570.037214 | 0.0 | 570.037214 | 471.335950 | 2560 | 896 | 869 | 0.593 | 8 | 0.009206 | 200 | 0.230150 | 661 | 0.760644 | 1406 | 289 | 358 | 759 | 0.205548 | 63 | 1 | 0 | 24 | 36 | 2 | 1 | 0 | 24 | 13 | 2 | 5 | 18 | 0 | 20 | 19 | 24 | 0.317460 | 1.900980 | 5.431370 | 0.133663 | 0.002122 | 0.000000 | 0.050919 | 0.076379 | 0.004243 | 0.002122 | 0.000000 | 0.050919 | 0.027581 | 0.004243 | 0.010608 | 0.038189 | 0.0 | 2.034642 | 0.0 | 0.072497 | 0.622319 | 0.536037 | 0.796767 | 0.686537 | 0.189938 | 0.234089 | 0.244020 | 1.481149e-03 | 0.048037 | 0.085773 | 22.317460 | 5.520070e-10 | 9.245028 | 1.434447 | 1.089662 | 6.720919 | 42.302361 | 4.490935 | 1.122734 | 2.245467 | 37.050213 | 14.595538 | 742.98 | 0.00000 | 0.000000 | 0.67 | 0.752232 | 0.000262 | 23024.0 | 26141.0 | 265.0 | 2111.0 | 0.002689 | 0.520646 | 0.002865 | 0.567431 | 0.000082 | 0.004533 | 0.000797 | 0.046532 | 3.0 | 15.0 | 9.0 | 14.0 | 10.666667 | 966 | 1406 | 2882 | 2041.60 | 1853 | 1286 | 289 | 358 | 759 | 0.317460 | 0.301587 | 0.380952 | 342 | 1.065048 | 0.125454 | 0.88972 | 2.043933 | 0 | 0 | 1 | 8 | 0.119722 | 0.113615 | -0.174782 | -0.245813 | 0.099881 | -0.238419 | -0.654242 | 0.01211 | 1.963416 | 0.085305 | 0.132144 | -0.164703 | 0.082742 | -0.174723 | 0.000804 | -0.180891 | -0.180891 | 0.037875 | -0.170452 | -0.234661 | -0.235263 | -0.002019 | 189374 | 499772 | 219301 | 358126.56 | 355082 | 0.632332 | 0.488313 | 0.209589 | 0.301544 | 0.488862 | 0.016077 | 6.306880 | 45.213616 | 51.520496 | 18.791349 | 72.049792 | 39.60906 | 45.904762 | 11935.238176 | 38.546556 | Sonoma County | Bay Area |
| 2 | 60971513093 | 60971513093 | 6 | 97 | 151309 | 3 | 488.0 | San Jose-San Francisco-Oakland, CA | 42220.0 | Santa Rosa-Petaluma, CA | 501317.0 | 201244.0 | 219300.0 | 982.997406 | 0.0 | 982.997406 | 982.846330 | 1610 | 712 | 648 | 0.574 | 77 | 0.118827 | 187 | 0.288580 | 384 | 0.592593 | 665 | 149 | 171 | 345 | 0.224060 | 282 | 0 | 0 | 31 | 245 | 6 | 0 | 0 | 31 | 5 | 6 | 61 | 179 | 0 | 83 | 104 | 95 | 0.294326 | 0.724427 | 1.638099 | 0.286922 | 0.000000 | 0.000000 | 0.031541 | 0.249276 | 0.006105 | 0.000000 | 0.000000 | 0.031541 | 0.005087 | 0.006105 | 0.062065 | 0.182124 | 0.0 | 1.011348 | 0.0 | 0.435185 | 0.406720 | 0.277630 | 0.631164 | 0.488506 | 0.540334 | 0.441643 | 0.477021 | 9.053520e-01 | 0.298097 | 0.595565 | 2.358156 | 2.571345e-01 | 6.367728 | 0.795925 | 0.849893 | 4.721909 | 21.711227 | 1.302140 | 4.557489 | 0.000000 | 22.136376 | 3.906419 | 1098.38 | 0.00000 | 0.000000 | 0.67 | 0.436217 | 0.000416 | 22544.0 | 25770.0 | 368.0 | 1478.0 | 0.002633 | 0.509792 | 0.002824 | 0.559378 | 0.000114 | 0.006295 | 0.000558 | 0.032579 | 11.0 | 8.0 | 7.0 | 13.0 | 9.833333 | 660 | 665 | 1722 | 1156.32 | 1283 | 647 | 149 | 171 | 345 | 0.294326 | 0.368794 | 0.336879 | 342 | 0.544895 | 0.252253 | 0.58552 | 1.744303 | 0 | 0 | 0 | 8 | 0.119722 | 0.113615 | -0.174782 | -0.245813 | 0.099881 | -0.238419 | -0.654242 | 0.01211 | 1.963416 | 0.085305 | 0.132144 | -0.164703 | 0.082742 | -0.174723 | 0.000804 | -0.180891 | -0.180891 | 0.037875 | -0.170452 | -0.234661 | -0.235263 | -0.002019 | 189374 | 499772 | 219301 | 358126.56 | 355082 | 0.632332 | 0.488313 | 0.209589 | 0.301544 | 0.488862 | 0.016077 | 7.067722 | 45.780375 | 52.848097 | 18.791349 | 72.049792 | 39.60906 | 47.087654 | 12242.790059 | 36.053805 | Sonoma County | Bay Area |
| 3 | 60971513101 | 60971513101 | 6 | 97 | 151310 | 1 | 488.0 | San Jose-San Francisco-Oakland, CA | 42220.0 | Santa Rosa-Petaluma, CA | 501317.0 | 201244.0 | 219300.0 | 51.480814 | 0.0 | 51.480814 | 51.080456 | 506 | 199 | 182 | 0.709 | 0 | 0.000000 | 35 | 0.192308 | 147 | 0.807692 | 268 | 63 | 73 | 132 | 0.235075 | 142 | 14 | 0 | 8 | 19 | 101 | 14 | 0 | 8 | 13 | 101 | 0 | 6 | 0 | 96 | 37 | 9 | 0.676056 | 3.895815 | 9.905941 | 2.779928 | 0.274077 | 0.000000 | 0.156616 | 0.371962 | 1.977273 | 0.274077 | 0.000000 | 0.156616 | 0.254500 | 1.977273 | 0.000000 | 0.117462 | 0.0 | 6.675743 | 0.0 | 0.780220 | 0.650603 | 0.560399 | 0.612247 | 0.473864 | 0.671536 | 0.565885 | 0.621614 | 5.384277e-01 | 0.438272 | 0.692683 | 1.887324 | 4.117562e-01 | 27.510900 | 0.000000 | 9.572344 | 17.938556 | 116.063433 | 0.000000 | 12.431816 | 24.863632 | 87.022711 | 24.863632 | 580.70 | 0.00000 | 0.479764 | 2.00 | 24.863632 | 0.003953 | 21597.0 | 25470.0 | 185.0 | 1195.0 | 0.002523 | 0.488377 | 0.002791 | 0.552866 | 0.000057 | 0.003164 | 0.000451 | 0.026341 | 15.0 | 7.0 | 15.0 | 15.0 | 13.666667 | 200 | 268 | 552 | 383.68 | 317 | 190 | 63 | 73 | 132 | 0.676056 | 0.260563 | 0.063380 | 342 | 1.588381 | 1.329705 | 0.00000 | 2.941200 | 0 | 0 | 3 | 8 | 0.119722 | 0.113615 | -0.174782 | -0.245813 | 0.099881 | -0.238419 | -0.654242 | 0.01211 | 1.963416 | 0.085305 | 0.132144 | -0.164703 | 0.082742 | -0.174723 | 0.000804 | -0.180891 | -0.180891 | 0.037875 | -0.170452 | -0.234661 | -0.235263 | -0.002019 | 189374 | 499772 | 219301 | 358126.56 | 355082 | 0.632332 | 0.488313 | 0.209589 | 0.301544 | 0.488862 | 0.016077 | 3.675845 | 24.646966 | 28.322812 | 18.791349 | 72.049792 | 39.60906 | 25.235625 | 6561.262585 | 82.103377 | Sonoma County | Bay Area |
| 4 | 60971513102 | 60971513102 | 6 | 97 | 151310 | 2 | 488.0 | San Jose-San Francisco-Oakland, CA | 42220.0 | Santa Rosa-Petaluma, CA | 501317.0 | 201244.0 | 219300.0 | 94.734433 | 0.0 | 94.734433 | 89.936999 | 1944 | 700 | 674 | 0.653 | 68 | 0.100890 | 97 | 0.143917 | 509 | 0.755193 | 502 | 93 | 166 | 243 | 0.185259 | 18 | 1 | 1 | 3 | 13 | 0 | 1 | 1 | 3 | 1 | 0 | 0 | 12 | 0 | 11 | 5 | 2 | 0.611111 | 7.783226 | 21.615131 | 0.200140 | 0.011119 | 0.011119 | 0.033357 | 0.144546 | 0.000000 | 0.011119 | 0.011119 | 0.033357 | 0.011119 | 0.000000 | 0.000000 | 0.133427 | 0.0 | 7.983366 | 0.0 | 0.026706 | 0.616612 | 0.531121 | 0.652815 | 0.505263 | 0.088743 | 0.129022 | 0.139861 | 1.035771e-07 | 0.018349 | 0.069231 | 27.888889 | 2.100409e-12 | 21.003141 | 0.000000 | 4.051075 | 16.952065 | 92.364304 | 0.000000 | 6.755727 | 13.511455 | 101.335910 | 6.755727 | 692.02 | 0.15082 | 0.597189 | 2.00 | 13.511455 | 0.001029 | 21512.0 | 25326.0 | 162.0 | 494.0 | 0.002513 | 0.486455 | 0.002775 | 0.549741 | 0.000050 | 0.002771 | 0.000186 | 0.010889 | 1.0 | 8.0 | 14.0 | 14.0 | 10.833333 | 636 | 502 | 1741 | 1295.36 | 1217 | 915 | 93 | 166 | 243 | 0.611111 | 0.277778 | 0.111111 | 342 | 2.172844 | 0.182438 | 0.00000 | 2.887705 | 0 | 0 | 3 | 8 | 0.119722 | 0.113615 | -0.174782 | -0.245813 | 0.099881 | -0.238419 | -0.654242 | 0.01211 | 1.963416 | 0.085305 | 0.132144 | -0.164703 | 0.082742 | -0.174723 | 0.000804 | -0.180891 | -0.180891 | 0.037875 | -0.170452 | -0.234661 | -0.235263 | -0.002019 | 189374 | 499772 | 219301 | 358126.56 | 355082 | 0.632332 | 0.488313 | 0.209589 | 0.301544 | 0.488862 | 0.016077 | 4.009743 | 33.743783 | 37.753526 | 18.791349 | 72.049792 | 39.60906 | 33.638392 | 8745.981842 | 64.395923 | Sonoma County | Bay Area |
model_data['COUNTY_NAME'].value_counts()
# Check negative values and nulls again
numeric_cols = model_data.select_dtypes(include=['float64', 'int64']).columns.tolist()
print(model_data.columns[model_data.isna().any()].tolist())
print(model_data[numeric_cols].columns[(model_data[numeric_cols] < 0 ).any()].tolist())
print(model_data[numeric_cols].columns[(model_data[numeric_cols] < 0 ).all()].tolist())
['CSA', 'CSA_Name', 'CBSA', 'CBSA_Name'] ['B_C_male', 'B_C_ld1c', 'B_C_drvmveh', 'B_C_ld1a', 'B_C_ld3apo', 'B_C_inc1', 'B_C_gasp', 'B_N_inc2', 'B_N_inc3', 'B_N_white', 'B_N_drvmveh', 'B_N_gasp', 'B_N_ld1a', 'B_N_ld1c', 'B_N_ld3apo', 'B_N_d4bo25', 'B_N_d5dei', 'B_N_UPTpc', 'C_R_DrmV'] ['B_N_white', 'B_N_drvmveh', 'B_N_ld1a', 'B_N_ld1c', 'B_N_ld3apo', 'B_N_d4bo25']
Let's find some patterns using a tier system (jobs, traffic, public transportation, population, walkability)¶
# Job Columns
job_cols = ['D1C8_OFF', 'D1C8_IND', 'D1C8_SVC', 'D1C8_HLTH', 'D1C8_PUB'] # office, industry, service, education, healthcare, public administration
model_data['D1C8_SUM'] = model_data[job_cols].sum(axis=1)
model_data['D1C8_SUM'] = model_data['D1C8_SUM'] / model_data['Ac_Unpr']
model_data[job_cols + ['D1C8_SUM', 'D2C_WREMLX']]
| D1C8_OFF | D1C8_IND | D1C8_SVC | D1C8_HLTH | D1C8_PUB | D1C8_SUM | D2C_WREMLX | |
|---|---|---|---|---|---|---|---|
| 0 | 0.000000 | 0.265376 | 0.149274 | 0.016586 | 0.0 | 0.007152 | 2.746536e-04 |
| 1 | 0.000000 | 0.050919 | 0.027581 | 0.038189 | 0.0 | 0.000248 | 5.520070e-10 |
| 2 | 0.000000 | 0.031541 | 0.005087 | 0.182124 | 0.0 | 0.000223 | 2.571345e-01 |
| 3 | 0.000000 | 0.156616 | 0.254500 | 0.117462 | 0.0 | 0.010348 | 4.117562e-01 |
| 4 | 0.011119 | 0.033357 | 0.011119 | 0.133427 | 0.0 | 0.002102 | 2.100409e-12 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 13466 | 0.613009 | 2.932155 | 1.277459 | 0.458685 | 0.0 | 0.022640 | 4.491367e-01 |
| 13467 | 0.565495 | 6.942588 | 2.201728 | 5.769423 | 0.0 | 0.013323 | 3.853825e-01 |
| 13468 | 0.000000 | 0.299218 | 0.000000 | 0.089765 | 0.0 | 0.005820 | 1.268813e-02 |
| 13469 | 0.034125 | 0.145032 | 0.059719 | 0.034125 | 0.0 | 0.002329 | 1.302689e-08 |
| 13470 | 0.012383 | 0.458157 | 0.099061 | 0.037148 | 0.0 | 0.007513 | 2.520925e-04 |
13471 rows × 7 columns
model_data['CSA'].value_counts()
CSA 348.0 8152 488.0 2961 472.0 840 260.0 461 454.0 63 Name: count, dtype: int64
# Apply normalization/standardization
scaler = MinMaxScaler().set_output(transform="pandas")
# Multiply sum and mix columns
model_data['job_score'] = model_data['D1C8_SUM'].mul(1 - model_data['D2C_WREMLX']) # we want low values since it is workers/total employment (number of jobs)
model_data[['job_score']] = scaler.fit_transform(model_data[['job_score']])
model_data[job_cols + ['D1C8_SUM', 'D2C_WREMLX', 'job_score', 'COUNTY_NAME', 'target']].sort_values('job_score', ascending=False)
| D1C8_OFF | D1C8_IND | D1C8_SVC | D1C8_HLTH | D1C8_PUB | D1C8_SUM | D2C_WREMLX | job_score | COUNTY_NAME | target | |
|---|---|---|---|---|---|---|---|---|---|---|
| 7228 | 124.264723 | 6.809026 | 24.580584 | 326.152342 | 251.968004 | 24.981454 | 0.375348 | 1.000000 | San Francisco County | Bay Area |
| 2852 | 74.388057 | 14.742905 | 151.919272 | 18.335085 | 0.000000 | 19.411640 | 0.459727 | 0.672079 | San Francisco County | Bay Area |
| 6062 | 9.061000 | 1.057117 | 381.921143 | 28.957445 | 0.000000 | 15.894379 | 0.388301 | 0.623054 | San Francisco County | Bay Area |
| 2851 | 324.996452 | 57.937571 | 298.387744 | 10.041895 | 0.000000 | 10.664518 | 0.373356 | 0.428259 | San Francisco County | Bay Area |
| 9024 | 3.677608 | 0.000000 | 1.838804 | 19.964156 | 0.000000 | 6.693395 | 0.137501 | 0.369956 | San Francisco County | Bay Area |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2331 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | Sacramento County | Central California |
| 7272 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | Sacramento County | Central California |
| 6323 | 0.168078 | 0.077574 | 0.155149 | 2.049256 | 0.000000 | 0.015838 | 1.000000 | 0.000000 | Stanislaus County | Central California |
| 11444 | 0.000000 | 0.108874 | 0.038426 | 0.563582 | 0.000000 | 0.004553 | 1.000000 | 0.000000 | Fresno County | Central California |
| 1297 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | Santa Clara County | Bay Area |
13471 rows × 10 columns
model_data['D2C_WREMLX'].describe()
count 13471.000000 mean 0.288830 std 0.311301 min 0.000000 25% 0.002644 50% 0.149160 75% 0.535116 max 1.000000 Name: D2C_WREMLX, dtype: float64
# Traffic columns
traffic_cols = ['D2C_TRIPEQ', 'D3A', 'D3B', 'D5AR','NatWalkInd'] # distance matric from job to home and vice versa, total road network density, street road network density, jobs within 45 minutes travel time, walkability index (higher is better)
model_data['D3A'] = 1 - model_data['D3A']
model_data['D3B'] = 1 - model_data['D3B']
model_data[traffic_cols] = scaler.fit_transform(model_data[traffic_cols])
model_data['traffic_score'] = model_data[traffic_cols].sum(axis=1) / len(traffic_cols)
model_data[traffic_cols + ['traffic_score']].sort_values('traffic_score', ascending=False)
| D2C_TRIPEQ | D3A | D3B | D5AR | NatWalkInd | traffic_score | |
|---|---|---|---|---|---|---|
| 227 | 0.976289 | 0.789169 | 0.890587 | 0.809755 | 0.813953 | 0.855951 |
| 2719 | 0.921273 | 0.669158 | 0.954830 | 0.995002 | 0.651163 | 0.838285 |
| 6657 | 0.993728 | 0.699506 | 0.906954 | 0.757107 | 0.790698 | 0.829598 |
| 7594 | 0.967988 | 0.669736 | 0.857759 | 0.804218 | 0.813953 | 0.822731 |
| 5785 | 0.956130 | 0.677275 | 0.875988 | 0.741041 | 0.860465 | 0.822180 |
| ... | ... | ... | ... | ... | ... | ... |
| 10174 | 0.057621 | 0.199694 | 0.210870 | 0.345388 | 0.639535 | 0.290622 |
| 1528 | 0.035523 | 0.158313 | 0.267526 | 0.271861 | 0.639535 | 0.274551 |
| 8902 | 0.031081 | 0.197573 | 0.308331 | 0.160787 | 0.662791 | 0.272112 |
| 2458 | 0.038702 | 0.144209 | 0.000000 | 0.091349 | 0.744186 | 0.203689 |
| 13117 | 0.002439 | 0.000000 | 0.068731 | 0.207854 | 0.581395 | 0.172084 |
13471 rows × 6 columns
model_data['combined_score'] = (model_data['job_score'] + model_data['traffic_score']) / 2
model_data[['job_score', 'traffic_score', 'combined_score', 'COUNTY_NAME', 'target']].sort_values('combined_score', ascending=False)
| job_score | traffic_score | combined_score | COUNTY_NAME | target | |
|---|---|---|---|---|---|
| 7228 | 1.000000 | 0.589772 | 0.794886 | San Francisco County | Bay Area |
| 2852 | 0.672079 | 0.602886 | 0.637482 | San Francisco County | Bay Area |
| 6062 | 0.623054 | 0.420562 | 0.521808 | San Francisco County | Bay Area |
| 9024 | 0.369956 | 0.661863 | 0.515909 | San Francisco County | Bay Area |
| 201 | 0.305844 | 0.709597 | 0.507720 | Los Angeles County | Southern California |
| ... | ... | ... | ... | ... | ... |
| 10174 | 0.003773 | 0.290622 | 0.147197 | Los Angeles County | Southern California |
| 1528 | 0.006105 | 0.274551 | 0.140328 | San Francisco County | Bay Area |
| 8902 | 0.000831 | 0.272112 | 0.136472 | Santa Clara County | Bay Area |
| 2458 | 0.000694 | 0.203689 | 0.102192 | Santa Clara County | Bay Area |
| 13117 | 0.001731 | 0.172084 | 0.086907 | Santa Clara County | Bay Area |
13471 rows × 5 columns
model_data.loc[model_data['target'] == 'Southern California'].groupby('COUNTY_NAME')['TotPop'].mean().sort_values(ascending=False)
COUNTY_NAME Riverside County 2244.340094 San Bernardino County 1955.863448 Kern County 1782.683636 San Diego County 1745.905488 Orange County 1735.675289 Santa Barbara County 1611.385321 San Luis Obispo County 1592.247525 Los Angeles County 1562.340418 Imperial County 1473.333333 Name: TotPop, dtype: float64
model_data.loc[model_data['target'] == 'Southern California'].groupby('COUNTY_NAME')['job_score'].mean().sort_values(ascending=False)
COUNTY_NAME Los Angeles County 0.002930 Santa Barbara County 0.001034 Orange County 0.000894 San Diego County 0.000680 San Bernardino County 0.000369 Riverside County 0.000317 San Luis Obispo County 0.000307 Kern County 0.000265 Imperial County 0.000107 Name: job_score, dtype: float64
model_data.loc[model_data['target'] == 'Southern California'].groupby('COUNTY_NAME')['traffic_score'].mean().sort_values(ascending=False)
COUNTY_NAME Los Angeles County 0.642654 Orange County 0.584887 San Luis Obispo County 0.557148 San Bernardino County 0.552887 Riverside County 0.551792 Santa Barbara County 0.549674 Kern County 0.543568 San Diego County 0.542136 Imperial County 0.531919 Name: traffic_score, dtype: float64
model_data.loc[model_data['target'] == 'Southern California'].groupby('COUNTY_NAME')['combined_score'].mean().sort_values(ascending=False)
COUNTY_NAME Los Angeles County 0.322792 Orange County 0.292890 San Luis Obispo County 0.278727 San Bernardino County 0.276628 Riverside County 0.276054 Santa Barbara County 0.275354 Kern County 0.271917 San Diego County 0.271408 Imperial County 0.266013 Name: combined_score, dtype: float64
model_data.loc[model_data['target'] == 'Southern California',['job_score', 'traffic_score', 'combined_score', 'COUNTY_NAME', 'target', 'GEOID20']].sort_values('combined_score', ascending=False)
| job_score | traffic_score | combined_score | COUNTY_NAME | target | GEOID20 | |
|---|---|---|---|---|---|---|
| 201 | 3.058435e-01 | 0.709597 | 0.507720 | Los Angeles County | Southern California | 60372124101 |
| 10634 | 1.606575e-01 | 0.804137 | 0.482397 | Los Angeles County | Southern California | 60372073011 |
| 4324 | 2.253846e-01 | 0.734779 | 0.480082 | Los Angeles County | Southern California | 60372121021 |
| 2725 | 2.763699e-01 | 0.606381 | 0.441375 | Los Angeles County | Southern California | 60372679023 |
| 5750 | 9.398291e-02 | 0.784949 | 0.439466 | Los Angeles County | Southern California | 60372073021 |
| ... | ... | ... | ... | ... | ... | ... |
| 9657 | 6.970668e-04 | 0.351461 | 0.176079 | Orange County | Southern California | 60590888024 |
| 1900 | 1.798097e-03 | 0.349198 | 0.175498 | Los Angeles County | Southern California | 60376210045 |
| 4523 | 4.299220e-07 | 0.339721 | 0.169861 | Santa Barbara County | Southern California | 60830029221 |
| 9487 | 7.744602e-04 | 0.318034 | 0.159404 | Orange County | Southern California | 60590635003 |
| 10174 | 3.773260e-03 | 0.290622 | 0.147197 | Los Angeles County | Southern California | 60376209042 |
8968 rows × 6 columns
score_cols = model_data[['job_score', 'traffic_score', 'combined_score']]
score_cols.head()
| job_score | traffic_score | combined_score | |
|---|---|---|---|
| 0 | 0.000458 | 0.460563 | 0.230511 |
| 1 | 0.000016 | 0.448862 | 0.224439 |
| 2 | 0.000011 | 0.628081 | 0.314046 |
| 3 | 0.000390 | 0.541434 | 0.270912 |
| 4 | 0.000135 | 0.413839 | 0.206987 |
plot_cols(model_data, ['job_score', 'traffic_score', 'combined_score'], 3, 'histplot')
#california_data = california_data.reset_index().drop('index', axis=1)
california_data_with_scores = pd.concat([california_data,score_cols], axis=1)
california_data_with_scores.loc[california_data_with_scores['target'] == 'Southern California',['job_score', 'traffic_score', 'combined_score', 'COUNTY_NAME', 'target', 'GEOID20', 'CSA']].sort_values('combined_score', ascending=False)
| job_score | traffic_score | combined_score | COUNTY_NAME | target | GEOID20 | CSA | |
|---|---|---|---|---|---|---|---|
| 201 | 3.058435e-01 | 0.709597 | 0.507720 | Los Angeles County | Southern California | 060372124101 | 348 |
| 10634 | 1.606575e-01 | 0.804137 | 0.482397 | Los Angeles County | Southern California | 060372073011 | 348 |
| 4324 | 2.253846e-01 | 0.734779 | 0.480082 | Los Angeles County | Southern California | 060372121021 | 348 |
| 2725 | 2.763699e-01 | 0.606381 | 0.441375 | Los Angeles County | Southern California | 060372679023 | 348 |
| 5750 | 9.398291e-02 | 0.784949 | 0.439466 | Los Angeles County | Southern California | 060372073021 | 348 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 9657 | 6.970668e-04 | 0.351461 | 0.176079 | Orange County | Southern California | 060590888024 | 348 |
| 1900 | 1.798097e-03 | 0.349198 | 0.175498 | Los Angeles County | Southern California | 060376210045 | 348 |
| 4523 | 4.299220e-07 | 0.339721 | 0.169861 | Santa Barbara County | Southern California | 060830029221 | None |
| 9487 | 7.744602e-04 | 0.318034 | 0.159404 | Orange County | Southern California | 060590635003 | 348 |
| 10174 | 3.773260e-03 | 0.290622 | 0.147197 | Los Angeles County | Southern California | 060376209042 | 348 |
8968 rows × 7 columns